Commit: Merge pull request #1 from maryamteimouri/python-scripts
Files added via upload
Showing 5 changed files with 3,757 additions and 0 deletions.
@@ -0,0 +1,215 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_QW_K_n7l---",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "7e9b713b-b44a-4ea3-f4af-0dd297f262c0"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.4/1.4 MB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.5/45.5 kB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m51.0/51.0 kB\u001b[0m \u001b[31m4.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "  Building wheel for ligo-segments (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
          ]
        }
      ],
      "source": [
        "!pip install -q gwpy"
      ]
    },
    },
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "!pip install numpy  # ==1.21.6\n",
        "!pip install spacy scikit-learn krippendorff\n",
        "!python -m spacy download en_core_web_sm\n",
        "!python -m spacy download fi_core_news_sm\n",
        "# Add other languages as needed"
      ],
      "metadata": {
        "id": "aW1FFJyWrO3Y"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%%capture\n",
        "!pip install nipype\n",
        "!pip install statsmodels"
      ],
      "metadata": {
        "id": "ZO-s7PDVxfmg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import defaultdict\n",
        "from statsmodels.stats.inter_rater import fleiss_kappa\n",
      ],
      "metadata": {
        "id": "oZVcliq51C9n"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import spacy\n",
        "import krippendorff\n",
        "import json\n",
        "\n",
        "# Load spaCy models for English and Finnish\n",
        "nlp_en = spacy.load(\"en_core_web_sm\")\n",
        "nlp_fi = spacy.load(\"fi_core_news_sm\")\n",
        "\n",
        "# Function to load annotations from a JSONL file\n",
        "def load_annotations(file_path):\n",
        "    annotations = []\n",
        "    with open(file_path, 'r') as f:\n",
        "        for line in f:\n",
        "            annotations.append(json.loads(line))\n",
        "    return annotations\n",
"\n", | ||
"file1 = 'GPT3_Emotion.jsonl'\n", | ||
"file2 = 'Eero-Emotions.jsonl'\n", | ||
"# Load annotations for English and Finnish\n", | ||
"annotations_en = load_annotations(file1)\n", | ||
"annotations_fi = load_annotations(file2)\n", | ||
"\n", | ||
"# Function to tokenize and extract labels\n", | ||
"def process_annotations(annotations, nlp_model):\n", | ||
" token_label_pairs = []\n", | ||
" for annotation in annotations:\n", | ||
" doc = nlp_model(annotation['text'])\n", | ||
" tokens = [token.text for token in doc]\n", | ||
" labels = annotation['label']\n", | ||
" token_label_pairs.append((tokens, labels))\n", | ||
" return token_label_pairs\n", | ||
"\n", | ||
"# Process annotations for both languages\n", | ||
"token_label_pairs_en = process_annotations(annotations_en, nlp_en)\n", | ||
"token_label_pairs_fi = process_annotations(annotations_fi, nlp_fi)\n", | ||
"\n", | ||
"# Function to align token-label pairs\n", | ||
"def align_annotations(token_label_pairs_en, token_label_pairs_fi):\n", | ||
" aligned_labels_en = []\n", | ||
" aligned_labels_fi = []\n", | ||
"\n", | ||
" for (tokens_en, labels_en), (tokens_fi, labels_fi) in zip(token_label_pairs_en, token_label_pairs_fi):\n", | ||
"\n", | ||
" # Assume both texts are translations and can be aligned at sentence level\n", | ||
" if len(tokens_en) != len(tokens_fi):\n", | ||
" # If lengths differ, consider padding or merging as appropriate\n", | ||
" # Here, we'll assume we can pad the shorter sequence with \"O\" labels\n", | ||
" max_len = max(len(tokens_en), len(tokens_fi))\n", | ||
" if len(tokens_en) < max_len:\n", | ||
" labels_en.extend([[0,0,'O']] * (max_len - len(labels_en)))\n", | ||
" if len(labels_fi) < max_len:\n", | ||
" labels_fi.extend([[0,0,'O']] * (max_len - len(labels_fi)))\n", | ||
"\n", | ||
" aligned_labels_en.extend(labels_en)\n", | ||
" aligned_labels_fi.extend(labels_fi)\n", | ||
"\n", | ||
" return aligned_labels_en, aligned_labels_fi\n", | ||
"\n", | ||
"# Align the annotations\n", | ||
"aligned_labels_en, aligned_labels_fi = align_annotations(token_label_pairs_en, token_label_pairs_fi)\n", | ||
"\n", | ||
"# Combine labels for agreement calculation\n", | ||
"combined_labels = list(zip(aligned_labels_en, aligned_labels_fi))\n", | ||
"\n", | ||
"# Prepare data in the format expected by Fleiss' Kappa\n", | ||
"data = []\n", | ||
"for en_labels, fi_labels in combined_labels:\n", | ||
" data.append([label for label in en_labels] + [label for label in fi_labels])\n", | ||
"\n", | ||
"label_mapping = {'Joy':1, 'Sadness':2, 'Anger':3, 'Fear':4, 'Surprise':5,\n", | ||
" 'Disgust':6, 'Trust':7, 'Anticipation':8,\n", | ||
" 'joy':1, 'sadness':2, 'anger':3, 'fear':4, 'surprise':5,\n", | ||
" 'disgust':6, 'trust':7, 'anticipation':8}\n", | ||
"\n", | ||
"#label_mapping = {'Speaker 1': 1, 'S1': 1, 'S2': 2, 'Speaker 2': 2,\n", | ||
"# 'Instructor': 3, 'Instrutor': 3}\n", | ||
"\n", | ||
"#label_mapping = {'Speaker 1': 1,'Interviewee': 1,'interviewee': 1,\n", | ||
"# 'interviewer': 2, 'Interviewer': 2 } # Add more labels as needed\n", | ||
"\n", | ||
"\n", | ||
"data = []\n", | ||
"for en_labels, fi_labels in combined_labels:\n", | ||
" numerical_en = [label_mapping.get(label, -1) for label in en_labels] # Map labels to numbers, use -1 for unknown labels\n", | ||
" numerical_fi = [label_mapping.get(label, -1) for label in fi_labels]\n", | ||
" data.append(numerical_en + numerical_fi)\n", | ||
"\n", | ||
"# Prepare data in the format expected by NLTK's Agreement metrics\n", | ||
"formatted_data = []\n", | ||
"for i, (en_labels, fi_labels) in enumerate(zip(aligned_labels_en, aligned_labels_fi)):\n", | ||
" for j, (en_label, fi_label) in enumerate(zip(en_labels, fi_labels)):\n", | ||
" # Extract the label string directly if en_label and fi_label are lists or tuples\n", | ||
" en_label_str = en_label[2] if isinstance(en_label, (list, tuple)) else str(en_label)\n", | ||
" fi_label_str = fi_label[2] if isinstance(fi_label, (list, tuple)) else str(fi_label)\n", | ||
"\n", | ||
" formatted_data.append(('coder_en', f'item_{i}_{j}', en_label_str))\n", | ||
" formatted_data.append(('coder_fi', f'item_{i}_{j}', fi_label_str))\n", | ||
"\n", | ||
"from nltk.metrics import agreement\n", | ||
"task = agreement.AnnotationTask(formatted_data)\n", | ||
"print(file1, ', ', file2, ' : ' \"Kappa:\", task.kappa())\n", | ||
"\n", | ||
"# Calculate Fleiss' Kappahouse of dragons\n", | ||
"#kappa = fleiss_kappa(data)\n", | ||
"\n", | ||
"# print(f\"Fleiss' Kappa: {kappa}\")\n" | ||
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ny99dYvl18__",
        "outputId": "e4f99acc-9a6d-45b9-bfb6-af54b61d1459"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "GPT3_Emotion.jsonl , Eero-Emotions.jsonl : Kappa: 0.8972155459286325\n"
          ]
        }
      ]
    }
  ]
}