From 7ea8d4b648d44ba4e32f6c61958577d1233b13f8 Mon Sep 17 00:00:00 2001 From: Christian A Date: Sat, 22 Jun 2024 10:54:00 +0000 Subject: [PATCH] Fix for Pipeline --- Group-8-Retrieval-System/Porter2Stemmer.ipynb | 220 +++--------------- 1 file changed, 26 insertions(+), 194 deletions(-) diff --git a/Group-8-Retrieval-System/Porter2Stemmer.ipynb b/Group-8-Retrieval-System/Porter2Stemmer.ipynb index 5dc9a8a..e244035 100644 --- a/Group-8-Retrieval-System/Porter2Stemmer.ipynb +++ b/Group-8-Retrieval-System/Porter2Stemmer.ipynb @@ -8,7 +8,7 @@ "source": [ "# You only need to execute this cell if you are using Google Golab.\n", "# If you use GitHub Codespaces, everything is already installed.\n", - "!pip3 install tira ir-datasets python-terrier" + "!pip3 install tira ir-datasets python-terrier nltk" ] }, { @@ -23,7 +23,7 @@ "\n", "import nltk\n", "from nltk.stem import PorterStemmer\n", - "nltk.download('punkt')\n", + "!nltk.download('punkt')\n", "\n", "import pyterrier as pt\n", "import pandas as pd" @@ -76,15 +76,6 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "stemmed_topics" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [], "source": [ "# Define the retrieval pipeline with BM25\n", "bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")\n", @@ -95,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -104,203 +95,44 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n", - "Done. run file is stored under \"../runs/run.txt\".\n" - ] - } - ], + "outputs": [], "source": [ "persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')" ] }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namemaprecip_rankP_1000map +map -map p-valuerecip_rank +recip_rank -recip_rank p-valueP_1000 +P_1000 -P_1000 p-value
0BM250.2623110.5798770.016191NoneNoneNoneNoneNoneNoneNoneNoneNone
\n", - "
" - ], - "text/plain": [ - " name map recip_rank P_1000 map + map - map p-value recip_rank + \\\n", - "0 BM25 0.262311 0.579877 0.016191 None None None None \n", - "\n", - " recip_rank - recip_rank p-value P_1000 + P_1000 - P_1000 p-value \n", - "0 None None None None None " - ] - }, - "execution_count": 86, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "#Localtest\n", - "pt.Experiment(\n", - " [bm25], \n", - " pt_dataset.get_topics(), \n", - " pt_dataset.get_qrels(), \n", - " eval_metrics=['P_1000', 'map', 'recip_rank'],\n", - " names=['BM25'],\n", - " baseline=0\n", - " )" + "# #Localtest\n", + "# pt.Experiment(\n", + "# [bm25], \n", + "# pt_dataset.get_topics(), \n", + "# pt_dataset.get_qrels(), \n", + "# eval_metrics=['P_1000', 'map', 'recip_rank'],\n", + "# names=['BM25'],\n", + "# baseline=0\n", + "# )" ] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namemaprecip_rankP_1000map +map -map p-valuerecip_rank +recip_rank -recip_rank p-valueP_1000 +P_1000 -P_1000 p-value
0BM25 + Porter2 Stemmer0.2433770.4984910.015456NoneNoneNoneNoneNoneNoneNoneNoneNone
\n", - "
" - ], - "text/plain": [ - " name map recip_rank P_1000 map + map - \\\n", - "0 BM25 + Porter2 Stemmer 0.243377 0.498491 0.015456 None None \n", - "\n", - " map p-value recip_rank + recip_rank - recip_rank p-value P_1000 + P_1000 - \\\n", - "0 None None None None None None \n", - "\n", - " P_1000 p-value \n", - "0 None " - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "pt.Experiment(\n", - " [bm25], \n", - " df_stemmed_topics, \n", - " pt_dataset.get_qrels(), \n", - " eval_metrics=['P_1000', 'map', 'recip_rank'],\n", - " names=['BM25 + Porter2 Stemmer'],\n", - " baseline=0\n", - ")" + "# pt.Experiment(\n", + "# [bm25], \n", + "# df_stemmed_topics, \n", + "# pt_dataset.get_qrels(), \n", + "# eval_metrics=['P_1000', 'map', 'recip_rank'],\n", + "# names=['BM25 + Porter2 Stemmer'],\n", + "# baseline=0\n", + "# )" ] } ],