Skip to content

Commit

Permalink
Merge pull request #1 from maryamteimouri/python-scripts
Browse files Browse the repository at this point in the history
Files added via upload
  • Loading branch information
maryamteimouri authored Jul 18, 2024
2 parents e11b341 + 57858e7 commit 46e12b5
Show file tree
Hide file tree
Showing 5 changed files with 3,757 additions and 0 deletions.
215 changes: 215 additions & 0 deletions Ann_Agreement.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "_QW_K_n7l---",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "7e9b713b-b44a-4ea3-f4af-0dd297f262c0"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.0/51.0 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for ligo-segments (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
]
}
],
"source": [
"!pip install -q gwpy"
]
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"# numpy is installed unpinned; the former ==1.21.6 pin stays disabled.\n",
"# (`numpy#==1.21.6` reached pip as a single word, because the shell only treats `#`\n",
"# as a comment at the start of a word, and that is not a valid requirement.)\n",
"!pip install numpy\n",
"# nltk is listed explicitly because the agreement cell imports nltk.metrics.\n",
"!pip install spacy scikit-learn krippendorff nltk\n",
"!python -m spacy download en_core_web_sm\n",
"!python -m spacy download fi_core_news_sm\n",
"# Add other languages as needed"
],
"metadata": {
"id": "aW1FFJyWrO3Y"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"%%capture\n",
"!pip install nipype\n",
"!pip install statsmodels"
],
"metadata": {
"id": "ZO-s7PDVxfmg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from collections import defaultdict\n",
"from statsmodels.stats.inter_rater import fleiss_kappa"
],
"metadata": {
"id": "oZVcliq51C9n"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"import spacy\n",
"import krippendorff\n",
"import json\n",
"\n",
"# Load spaCy models for English and Finnish.\n",
"# Both model packages have to be installed beforehand\n",
"# (`python -m spacy download <model name>`).\n",
"nlp_en = spacy.load(\"en_core_web_sm\")\n",
"nlp_fi = spacy.load(\"fi_core_news_sm\")\n",
"\n",
"# Function to load annotations from JSONL file\n",
"def load_annotations(file_path):\n",
"    \"\"\"Read a JSON Lines file and return its records as a list.\n",
"\n",
"    Args:\n",
"        file_path: Path of a file holding one JSON document per line.\n",
"\n",
"    Returns:\n",
"        list: The decoded value of every non-blank line, in file order.\n",
"\n",
"    Raises:\n",
"        OSError: If the file cannot be opened.\n",
"        UnicodeDecodeError: If the file is not valid UTF-8.\n",
"        json.JSONDecodeError: If a non-blank line is not valid JSON.\n",
"    \"\"\"\n",
"    annotations = []\n",
"    # JSON text is UTF-8; do not rely on the platform default encoding, which can\n",
"    # fail on (or mangle) non-ASCII characters such as Finnish letters.\n",
"    with open(file_path, 'r', encoding='utf-8') as f:\n",
"        for line in f:\n",
"            # Blank lines (e.g. a trailing newline at EOF) carry no record.\n",
"            if not line.strip():\n",
"                continue\n",
"            annotations.append(json.loads(line))\n",
"    return annotations\n",
"\n",
"# The two JSONL annotation files whose labels are compared\n",
"# (paths are relative to the working directory).\n",
"file1 = 'GPT3_Emotion.jsonl'\n",
"file2 = 'Eero-Emotions.jsonl'\n",
"# Load both annotation sets.\n",
"# NOTE(review): the `_en` / `_fi` suffixes match the tokenizer language applied to each\n",
"# file further down; judging by the file names these may be two annotators rather than\n",
"# two languages -- confirm.\n",
"annotations_en = load_annotations(file1)\n",
"annotations_fi = load_annotations(file2)\n",
"\n",
"# Function to tokenize and extract labels\n",
"def process_annotations(annotations, nlp_model):\n",
"    \"\"\"Tokenize every annotation's text and pair the tokens with its labels.\n",
"\n",
"    Args:\n",
"        annotations: Iterable of mappings that each provide a 'text' and a\n",
"            'label' entry.\n",
"        nlp_model: Callable that takes the text and returns an iterable of\n",
"            objects exposing a `.text` attribute (a spaCy pipeline in this notebook).\n",
"\n",
"    Returns:\n",
"        list: One `(tokens, labels)` tuple per annotation, in input order.\n",
"        `tokens` is a list of token strings; `labels` is the very object\n",
"        stored under 'label' (not a copy).\n",
"    \"\"\"\n",
"    return [\n",
"        ([tok.text for tok in nlp_model(record['text'])], record['label'])\n",
"        for record in annotations\n",
"    ]\n",
"\n",
"# Process annotations for both languages: the records of file1 are tokenized with the\n",
"# English model, the records of file2 with the Finnish model.\n",
"token_label_pairs_en = process_annotations(annotations_en, nlp_en)\n",
"token_label_pairs_fi = process_annotations(annotations_fi, nlp_fi)\n",
"\n",
"# Function to align token-label pairs\n",
"def align_annotations(token_label_pairs_en, token_label_pairs_fi):\n",
"    \"\"\"Flatten two per-document label lists into two parallel lists.\n",
"\n",
"    Documents are paired by position (extra documents on either side are\n",
"    ignored, as with `zip`). For every pair both label lists are padded with\n",
"    `[0, 0, 'O']` entries to a common length, so the two returned lists always\n",
"    have the same number of entries and stay in step across documents.\n",
"\n",
"    The common length is the longer of the two label lists; when the token\n",
"    counts of the two texts differ it is additionally at least the larger\n",
"    token count.\n",
"\n",
"    Args:\n",
"        token_label_pairs_en: List of `(tokens, labels)` tuples of the first source.\n",
"        token_label_pairs_fi: List of `(tokens, labels)` tuples of the second source.\n",
"\n",
"    Returns:\n",
"        tuple: `(aligned_labels_en, aligned_labels_fi)`, two flat lists of\n",
"        label entries of equal length. The input label lists are not modified.\n",
"    \"\"\"\n",
"    aligned_labels_en = []\n",
"    aligned_labels_fi = []\n",
"\n",
"    for (tokens_en, labels_en), (tokens_fi, labels_fi) in zip(token_label_pairs_en, token_label_pairs_fi):\n",
"        # Work on copies so the caller's annotation records are never mutated.\n",
"        labels_en = list(labels_en)\n",
"        labels_fi = list(labels_fi)\n",
"\n",
"        target_len = max(len(labels_en), len(labels_fi))\n",
"        if len(tokens_en) != len(tokens_fi):\n",
"            # Texts of different length: pad up to the longer token sequence.\n",
"            target_len = max(target_len, len(tokens_en), len(tokens_fi))\n",
"\n",
"        # The decision to pad is based on the label list itself (not on the token\n",
"        # list), and every padding entry is a fresh list rather than one shared object.\n",
"        labels_en.extend([0, 0, 'O'] for _ in range(target_len - len(labels_en)))\n",
"        labels_fi.extend([0, 0, 'O'] for _ in range(target_len - len(labels_fi)))\n",
"\n",
"        aligned_labels_en.extend(labels_en)\n",
"        aligned_labels_fi.extend(labels_fi)\n",
"\n",
"    return aligned_labels_en, aligned_labels_fi\n",
"\n",
"# Align the annotations\n",
"aligned_labels_en, aligned_labels_fi = align_annotations(token_label_pairs_en, token_label_pairs_fi)\n",
"\n",
"# Pair the entries of the two sources position by position\n",
"combined_labels = list(zip(aligned_labels_en, aligned_labels_fi))\n",
"\n",
"label_mapping = {'Joy':1, 'Sadness':2, 'Anger':3, 'Fear':4, 'Surprise':5,\n",
"                 'Disgust':6, 'Trust':7, 'Anticipation':8,\n",
"                 'joy':1, 'sadness':2, 'anger':3, 'fear':4, 'surprise':5,\n",
"                 'disgust':6, 'trust':7, 'anticipation':8}\n",
"\n",
"#label_mapping = {'Speaker 1': 1, 'S1': 1, 'S2': 2, 'Speaker 2': 2,\n",
"#                 'Instructor': 3, 'Instrutor': 3}\n",
"\n",
"#label_mapping = {'Speaker 1': 1,'Interviewee': 1,'interviewee': 1,\n",
"#                 'interviewer': 2, 'Interviewer': 2 } # Add more labels as needed\n",
"\n",
"\n",
"def _label_name(entry):\n",
"    \"\"\"Return the label string of one aligned entry.\n",
"\n",
"    A list/tuple entry is read as `[start, end, label]` and its third element\n",
"    is returned; any other value is converted with `str()` and used as the label.\n",
"    \"\"\"\n",
"    return entry[2] if isinstance(entry, (list, tuple)) else str(entry)\n",
"\n",
"\n",
"# Numeric label codes: one [code_en, code_fi] row per aligned position.\n",
"# Labels missing from label_mapping (including the 'O' padding) become -1.\n",
"data = [\n",
"    [label_mapping.get(_label_name(en_entry), -1), label_mapping.get(_label_name(fi_entry), -1)]\n",
"    for en_entry, fi_entry in combined_labels\n",
"]\n",
"\n",
"# Prepare data in the format expected by NLTK's AnnotationTask: (coder, item, label).\n",
"# Each aligned position is exactly one item. Iterating over the elements of an entry\n",
"# instead would turn its start/end offsets into rated items as well and distort kappa.\n",
"formatted_data = []\n",
"for i, (en_entry, fi_entry) in enumerate(combined_labels):\n",
"    formatted_data.append(('coder_en', f'item_{i}', _label_name(en_entry)))\n",
"    formatted_data.append(('coder_fi', f'item_{i}', _label_name(fi_entry)))\n",
"\n",
"from nltk.metrics import agreement\n",
"task = agreement.AnnotationTask(formatted_data)\n",
"print(file1, ', ', file2, ' : ' \"Kappa:\", task.kappa())\n",
"\n",
"# Calculate Fleiss' Kappa.\n",
"# NOTE: fleiss_kappa expects a subjects x categories count table, so `data` has to be\n",
"# converted first (e.g. with statsmodels.stats.inter_rater.aggregate_raters).\n",
"#kappa = fleiss_kappa(data)\n",
"\n",
"# print(f\"Fleiss' Kappa: {kappa}\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ny99dYvl18__",
"outputId": "e4f99acc-9a6d-45b9-bfb6-af54b61d1459"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"GPT3_Emotion.jsonl , Eero-Emotions.jsonl : Kappa: 0.8972155459286325\n"
]
}
]
}
]
}
Loading

0 comments on commit 46e12b5

Please sign in to comment.