Skip to content

Commit

Permalink
Bugfixing
Browse files Browse the repository at this point in the history
  • Loading branch information
Christian A committed Jun 25, 2024
1 parent 3789e37 commit 1d109c5
Showing 1 changed file with 145 additions and 67 deletions.
212 changes: 145 additions & 67 deletions Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,84 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 30,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tira in /usr/local/lib/python3.10/dist-packages (0.0.132)\n",
"Requirement already satisfied: ir-datasets in /usr/local/lib/python3.10/dist-packages (0.5.5)\n",
"Requirement already satisfied: python-terrier in /usr/local/lib/python3.10/dist-packages (0.10.0)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n",
"Requirement already satisfied: requests==2.*,>=2.26 in /usr/local/lib/python3.10/dist-packages (from tira) (2.31.0)\n",
"Requirement already satisfied: docker==6.*,>=6.0.0 in /usr/local/lib/python3.10/dist-packages (from tira) (6.1.3)\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from tira) (4.66.1)\n",
"Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from tira) (2.1.3)\n",
"Requirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n",
"Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n",
"Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.1.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (2023.11.17)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.6)\n",
"Requirement already satisfied: trec-car-tools>=2.5.4 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.6)\n",
"Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.3)\n",
"Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.9.3)\n",
"Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n",
"Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.2)\n",
"Requirement already satisfied: inscriptis>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.3.2)\n",
"Requirement already satisfied: lz4>=3.1.10 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.3.2)\n",
"Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.6)\n",
"Requirement already satisfied: unlzw3>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.2)\n",
"Requirement already satisfied: pyautocorpus>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.12)\n",
"Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (3.2.3)\n",
"Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (1.26.2)\n",
"Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.5)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n",
"Requirement already satisfied: wget in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.2)\n",
"Requirement already satisfied: nptyping==1.4.4 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.4.4)\n",
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n",
"Requirement already satisfied: ir-measures>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.3)\n",
"Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.7)\n",
"Requirement already satisfied: chest in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.2.3)\n",
"Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.0)\n",
"Requirement already satisfied: matchpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.5)\n",
"Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n",
"Requirement already satisfied: pyjnius>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.6.1)\n",
"Requirement already satisfied: deprecated in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.14)\n",
"Requirement already satisfied: pytrec-eval-terrier>=0.5.3 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.6)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.2)\n",
"Requirement already satisfied: typish>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from nptyping==1.4.4->python-terrier) (1.9.3)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)\n",
"Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2024.5.15)\n",
"Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir-datasets) (2.5)\n",
"Requirement already satisfied: cwl-eval>=1.0.10 in /usr/local/lib/python3.10/dist-packages (from ir-measures>=0.3.1->python-terrier) (1.0.12)\n",
"Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from trec-car-tools>=2.5.4->ir-datasets) (1.0.0)\n",
"Requirement already satisfied: heapdict in /usr/local/lib/python3.10/dist-packages (from chest->python-terrier) (1.0.1)\n",
"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.16.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.3)\n",
"Requirement already satisfied: multiset<3.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from matchpy->python-terrier) (2.1.1)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3.post1)\n",
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.2.0)\n",
"Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.4)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.2->statsmodels->python-terrier) (1.16.0)\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m"
]
}
],
"source": [
"# Install necessary libraries\n",
"!pip3 install tira ir-datasets python-terrier nltk"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -37,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -48,7 +115,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -59,7 +126,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -80,28 +147,28 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Display some example data\n",
"topics = dataset.get_topics('text')\n",
"qrels = dataset.get_qrels()\n"
"#topics = dataset.get_topics('text')\n",
"#qrels = dataset.get_qrels()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"print(topics.head())\n",
"print(qrels.head())"
"#print(topics.head())\n",
"#print(qrels.head())"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -119,7 +186,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -129,7 +196,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -146,34 +213,34 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate BM25\n",
"bm25_results = bm25.transform(topics)\n",
"bm25_metrics = pt.Experiment(\n",
" [bm25],\n",
" topics,\n",
" qrels,\n",
" eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n",
")"
"# # Evaluate BM25\n",
"# bm25_results = bm25.transform(topics)\n",
"# bm25_metrics = pt.Experiment(\n",
"# [bm25],\n",
"# dataset.get_topics(), \n",
"# dataset.get_qrels(), \n",
"# eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Evaluate TF-IDF\n",
"tfidf_results = tfidf.transform(topics)\n",
"tfidf_metrics = pt.Experiment(\n",
" [tfidf],\n",
" topics,\n",
" qrels,\n",
" eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n",
")"
"# # Evaluate TF-IDF\n",
"# tfidf_results = tfidf.transform(topics)\n",
"# tfidf_metrics = pt.Experiment(\n",
"# [tfidf],\n",
"# topics,\n",
"# qrels,\n",
"# eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n",
"# )"
]
},
{
Expand All @@ -185,9 +252,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 42,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n",
"Done. run file is stored under \"../runs/run.txt\".\n",
"The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n",
"Done. run file is stored under \"../runs/run.txt\".\n"
]
}
],
"source": [
"run = bm25_results\n",
"persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')\n",
Expand All @@ -198,13 +276,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Summarize findings\n",
"print(\"BM25 Metrics:\", bm25_metrics)\n",
"print(\"TF-IDF Metrics:\", tfidf_metrics)"
"# # Summarize findings\n",
"# print(\"BM25 Metrics:\", bm25_metrics)\n",
"# print(\"TF-IDF Metrics:\", tfidf_metrics)"
]
},
{
Expand All @@ -223,30 +301,30 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Local test of BM25\n",
"bm25_experiment = pt.Experiment(\n",
" [bm25],\n",
" topics,\n",
" qrels,\n",
" eval_metrics=['P_1000', 'map', 'recip_rank'],\n",
" names=['BM25'],\n",
" baseline=0\n",
")"
"# # Local test of BM25\n",
"# bm25_experiment = pt.Experiment(\n",
"# [bm25],\n",
"# topics,\n",
"# qrels,\n",
"# eval_metrics=['P_1000', 'map', 'recip_rank'],\n",
"# names=['BM25'],\n",
"# baseline=0\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Display BM25 results\n",
"print(\"BM25 Experiment Results\")\n",
"print(bm25_experiment)"
"# # Display BM25 results\n",
"# print(\"BM25 Experiment Results\")\n",
"# print(bm25_experiment)"
]
},
{
Expand All @@ -258,30 +336,30 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Local test of TF-IDF\n",
"tfidf_experiment = pt.Experiment(\n",
" [tfidf],\n",
" topics,\n",
" qrels,\n",
" eval_metrics=['P_1000', 'map', 'recip_rank'],\n",
" names=['TF-IDF'],\n",
" baseline=0\n",
")"
"# # Local test of TF-IDF\n",
"# tfidf_experiment = pt.Experiment(\n",
"# [tfidf],\n",
"# topics,\n",
"# qrels,\n",
"# eval_metrics=['P_1000', 'map', 'recip_rank'],\n",
"# names=['TF-IDF'],\n",
"# baseline=0\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# Display TF-IDF results\n",
"print(\"TF-IDF Experiment Results\")\n",
"print(tfidf_experiment)"
"# # Display TF-IDF results\n",
"# print(\"TF-IDF Experiment Results\")\n",
"# print(tfidf_experiment)"
]
}
],
Expand Down

0 comments on commit 1d109c5

Please sign in to comment.