Merge pull request #1 from maryamteimouri/python-scripts

Files added via upload
maryamteimouri · Jul 18, 2024 · 46e12b5 · 46e12b5
2 parents e11b341 + 57858e7
commit 46e12b5
Show file tree

Hide file tree

Showing 5 changed files with 3,757 additions and 0 deletions.
diff --git a/Ann_Agreement.ipynb b/Ann_Agreement.ipynb
@@ -0,0 +1,215 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "_QW_K_n7l---",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "7e9b713b-b44a-4ea3-f4af-0dd297f262c0"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.0/51.0 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+            "  Building wheel for ligo-segments (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
+          ]
+        }
+      ],
+      "source": [
+        "!pip install -q gwpy"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%capture\n",
+        "!pip install numpy#==1.21.6\n",
+        "!pip install spacy scikit-learn krippendorff\n",
+        "!python -m spacy download en_core_web_sm\n",
+        "!python -m spacy download fi_core_news_sm\n",
+        "# Add other languages as needed"
+      ],
+      "metadata": {
+        "id": "aW1FFJyWrO3Y"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%capture\n",
+        "!pip install nipype\n",
+        "!pip install statsmodels"
+      ],
+      "metadata": {
+        "id": "ZO-s7PDVxfmg"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "from collections import defaultdict\n",
+        "from statsmodels.stats.inter_rater import fleiss_kappa"
+      ],
+      "metadata": {
+        "id": "oZVcliq51C9n"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import spacy\n",
+        "import krippendorff\n",
+        "import json\n",
+        "\n",
+        "# Load spaCy models for English and Finnish\n",
+        "nlp_en = spacy.load(\"en_core_web_sm\")\n",
+        "nlp_fi = spacy.load(\"fi_core_news_sm\")\n",
+        "\n",
+        "# Function to load annotations from JSONL file\n",
+        "def load_annotations(file_path):\n",
+        "    annotations = []\n",
+        "    with open(file_path, 'r') as f:\n",
+        "        for line in f:\n",
+        "            annotations.append(json.loads(line))\n",
+        "    return annotations\n",
+        "\n",
+        "file1 = 'GPT3_Emotion.jsonl'\n",
+        "file2 = 'Eero-Emotions.jsonl'\n",
+        "# Load annotations for English and Finnish\n",
+        "annotations_en = load_annotations(file1)\n",
+        "annotations_fi = load_annotations(file2)\n",
+        "\n",
+        "# Function to tokenize and extract labels\n",
+        "def process_annotations(annotations, nlp_model):\n",
+        "    token_label_pairs = []\n",
+        "    for annotation in annotations:\n",
+        "        doc = nlp_model(annotation['text'])\n",
+        "        tokens = [token.text for token in doc]\n",
+        "        labels = annotation['label']\n",
+        "        token_label_pairs.append((tokens, labels))\n",
+        "    return token_label_pairs\n",
+        "\n",
+        "# Process annotations for both languages\n",
+        "token_label_pairs_en = process_annotations(annotations_en, nlp_en)\n",
+        "token_label_pairs_fi = process_annotations(annotations_fi, nlp_fi)\n",
+        "\n",
+        "# Function to align token-label pairs\n",
+        "def align_annotations(token_label_pairs_en, token_label_pairs_fi):\n",
+        "    aligned_labels_en = []\n",
+        "    aligned_labels_fi = []\n",
+        "\n",
+        "    for (tokens_en, labels_en), (tokens_fi, labels_fi) in zip(token_label_pairs_en, token_label_pairs_fi):\n",
+        "\n",
+        "        # Assume both texts are translations and can be aligned at sentence level\n",
+        "        if len(tokens_en) != len(tokens_fi):\n",
+        "            # If lengths differ, consider padding or merging as appropriate\n",
+        "            # Here, we'll assume we can pad the shorter sequence with \"O\" labels\n",
+        "            max_len = max(len(tokens_en), len(tokens_fi))\n",
+        "            if len(tokens_en) < max_len:\n",
+        "                labels_en.extend([[0,0,'O']] * (max_len - len(labels_en)))\n",
+        "            if len(labels_fi) < max_len:\n",
+        "                labels_fi.extend([[0,0,'O']] * (max_len - len(labels_fi)))\n",
+        "\n",
+        "        aligned_labels_en.extend(labels_en)\n",
+        "        aligned_labels_fi.extend(labels_fi)\n",
+        "\n",
+        "    return aligned_labels_en, aligned_labels_fi\n",
+        "\n",
+        "# Align the annotations\n",
+        "aligned_labels_en, aligned_labels_fi = align_annotations(token_label_pairs_en, token_label_pairs_fi)\n",
+        "\n",
+        "# Combine labels for agreement calculation\n",
+        "combined_labels = list(zip(aligned_labels_en, aligned_labels_fi))\n",
+        "\n",
+        "# Prepare data in the format expected by Fleiss' Kappa\n",
+        "data = []\n",
+        "for en_labels, fi_labels in combined_labels:\n",
+        "    data.append([label for label in en_labels] + [label for label in fi_labels])\n",
+        "\n",
+        "label_mapping = {'Joy':1, 'Sadness':2, 'Anger':3, 'Fear':4, 'Surprise':5,\n",
+        "                 'Disgust':6, 'Trust':7, 'Anticipation':8,\n",
+        "                 'joy':1, 'sadness':2, 'anger':3, 'fear':4, 'surprise':5,\n",
+        "                 'disgust':6, 'trust':7, 'anticipation':8}\n",
+        "\n",
+        "#label_mapping = {'Speaker 1': 1, 'S1': 1, 'S2': 2, 'Speaker 2': 2,\n",
+        "#                 'Instructor': 3, 'Instrutor': 3}\n",
+        "\n",
+        "#label_mapping = {'Speaker 1': 1,'Interviewee': 1,'interviewee': 1,\n",
+        "#                 'interviewer': 2, 'Interviewer': 2 }  # Add more labels as needed\n",
+        "\n",
+        "\n",
+        "data = []\n",
+        "for en_labels, fi_labels in combined_labels:\n",
+        "    numerical_en = [label_mapping.get(label, -1) for label in en_labels] # Map labels to numbers, use -1 for unknown labels\n",
+        "    numerical_fi = [label_mapping.get(label, -1) for label in fi_labels]\n",
+        "    data.append(numerical_en + numerical_fi)\n",
+        "\n",
+        "# Prepare data in the format expected by NLTK's Agreement metrics\n",
+        "formatted_data = []\n",
+        "for i, (en_labels, fi_labels) in enumerate(zip(aligned_labels_en, aligned_labels_fi)):\n",
+        "    for j, (en_label, fi_label) in enumerate(zip(en_labels, fi_labels)):\n",
+        "        # Extract the label string directly if en_label and fi_label are lists or tuples\n",
+        "        en_label_str = en_label[2] if isinstance(en_label, (list, tuple)) else str(en_label)\n",
+        "        fi_label_str = fi_label[2] if isinstance(fi_label, (list, tuple)) else str(fi_label)\n",
+        "\n",
+        "        formatted_data.append(('coder_en', f'item_{i}_{j}', en_label_str))\n",
+        "        formatted_data.append(('coder_fi', f'item_{i}_{j}', fi_label_str))\n",
+        "\n",
+        "from nltk.metrics import agreement\n",
+        "task = agreement.AnnotationTask(formatted_data)\n",
+        "print(file1, ', ', file2, ' : ' \"Kappa:\", task.kappa())\n",
+        "\n",
+        "# Calculate Fleiss' Kappahouse of dragons\n",
+        "#kappa = fleiss_kappa(data)\n",
+        "\n",
+        "# print(f\"Fleiss' Kappa: {kappa}\")\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "Ny99dYvl18__",
+        "outputId": "e4f99acc-9a6d-45b9-bfb6-af54b61d1459"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "GPT3_Emotion.jsonl ,  Eero-Emotions.jsonl  : Kappa: 0.8972155459286325\n"
+          ]
+        }
+      ]
+    }
+  ]
+}