diff --git a/Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb b/Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb index c0111bc..30caee1 100644 --- a/Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb +++ b/Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb @@ -9,9 +9,76 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: tira in /usr/local/lib/python3.10/dist-packages (0.0.132)\n", + "Requirement already satisfied: ir-datasets in /usr/local/lib/python3.10/dist-packages (0.5.5)\n", + "Requirement already satisfied: python-terrier in /usr/local/lib/python3.10/dist-packages (0.10.0)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)\n", + "Requirement already satisfied: requests==2.*,>=2.26 in /usr/local/lib/python3.10/dist-packages (from tira) (2.31.0)\n", + "Requirement already satisfied: docker==6.*,>=6.0.0 in /usr/local/lib/python3.10/dist-packages (from tira) (6.1.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from tira) (4.66.1)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from tira) (2.1.3)\n", + "Requirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n", + "Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.1.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (2023.11.17)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests==2.*,>=2.26->tira) (3.6)\n", + "Requirement already satisfied: trec-car-tools>=2.5.4 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.6)\n", + "Requirement already satisfied: warc3-wet>=0.2.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.3)\n", + "Requirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.9.3)\n", + "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n", + "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.2)\n", + "Requirement already satisfied: inscriptis>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (2.3.2)\n", + "Requirement already satisfied: lz4>=3.1.10 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.3.2)\n", + "Requirement already satisfied: zlib-state>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.6)\n", + "Requirement already satisfied: unlzw3>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.2)\n", + "Requirement already satisfied: pyautocorpus>=0.1.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.1.12)\n", + "Requirement already satisfied: ijson>=3.1.3 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (3.2.3)\n", + "Requirement already satisfied: numpy>=1.18.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (1.26.2)\n", + "Requirement already satisfied: warc3-wet-clueweb09>=0.2.5 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (0.2.5)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", + "Requirement already satisfied: wget in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.2)\n", + "Requirement already satisfied: nptyping==1.4.4 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.4.4)\n", + "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", + "Requirement already satisfied: ir-measures>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.3)\n", + "Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.3.7)\n", + "Requirement already satisfied: chest in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.2.3)\n", + "Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.0)\n", + "Requirement already satisfied: matchpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.5)\n", + "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n", + "Requirement already satisfied: pyjnius>=1.4.2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.6.1)\n", + "Requirement already satisfied: deprecated in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.14)\n", + "Requirement already satisfied: pytrec-eval-terrier>=0.5.3 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.5.6)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.2)\n", + "Requirement already satisfied: typish>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from nptyping==1.4.4->python-terrier) (1.9.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)\n", + "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2024.5.15)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir-datasets) (2.5)\n", + "Requirement already satisfied: cwl-eval>=1.0.10 in /usr/local/lib/python3.10/dist-packages (from ir-measures>=0.3.1->python-terrier) (1.0.12)\n", + "Requirement already satisfied: cbor>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from trec-car-tools>=2.5.4->ir-datasets) (1.0.0)\n", + "Requirement already satisfied: heapdict in /usr/local/lib/python3.10/dist-packages (from chest->python-terrier) (1.0.1)\n", + "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.3)\n", + "Requirement already satisfied: multiset<3.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from matchpy->python-terrier) (2.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3.post1)\n", + "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->tira) (2023.3)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.2.0)\n", + "Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.4)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.2->statsmodels->python-terrier) (1.16.0)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], "source": [ "# Install necessary libraries\n", "!pip3 install tira ir-datasets python-terrier nltk" @@ -19,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -37,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -48,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -59,7 +126,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -80,28 +147,28 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# Display some example data\n", - "topics = dataset.get_topics('text')\n", - "qrels = dataset.get_qrels()\n" + "#topics = dataset.get_topics('text')\n", + "#qrels = dataset.get_qrels()\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ - "print(topics.head())\n", - "print(qrels.head())" + "#print(topics.head())\n", + "#print(qrels.head())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -119,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -129,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -146,34 +213,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ - "# Evaluate BM25\n", - "bm25_results = bm25.transform(topics)\n", - "bm25_metrics = pt.Experiment(\n", - " [bm25],\n", - " topics,\n", - " qrels,\n", - " eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n", - ")" + "# # Evaluate BM25\n", + "# bm25_results = bm25.transform(topics)\n", + "# bm25_metrics = pt.Experiment(\n", + "# [bm25],\n", + "# dataset.get_topics(), \n", + "# dataset.get_qrels(), \n", + "# eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n", + "# )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ - "# Evaluate TF-IDF\n", - "tfidf_results = tfidf.transform(topics)\n", - "tfidf_metrics = pt.Experiment(\n", - " [tfidf],\n", - " topics,\n", - " qrels,\n", - " eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n", - ")" + "# # Evaluate TF-IDF\n", + "# tfidf_results = tfidf.transform(topics)\n", + "# tfidf_metrics = pt.Experiment(\n", + "# [tfidf],\n", + "# topics,\n", + "# qrels,\n", + "# eval_metrics=[\"map\", \"ndcg\", \"recip_rank\"]\n", + "# )" ] }, { @@ -185,9 +252,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n", + "Done. run file is stored under \"../runs/run.txt\".\n", + "The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n", + "Done. run file is stored under \"../runs/run.txt\".\n" + ] + } + ], "source": [ "run = bm25_results\n", "persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')\n", @@ -198,13 +276,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ - "# Summarize findings\n", - "print(\"BM25 Metrics:\", bm25_metrics)\n", - "print(\"TF-IDF Metrics:\", tfidf_metrics)" + "# # Summarize findings\n", + "# print(\"BM25 Metrics:\", bm25_metrics)\n", + "# print(\"TF-IDF Metrics:\", tfidf_metrics)" ] }, { @@ -223,30 +301,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ - "# Local test of BM25\n", - "bm25_experiment = pt.Experiment(\n", - " [bm25],\n", - " topics,\n", - " qrels,\n", - " eval_metrics=['P_1000', 'map', 'recip_rank'],\n", - " names=['BM25'],\n", - " baseline=0\n", - ")" + "# # Local test of BM25\n", + "# bm25_experiment = pt.Experiment(\n", + "# [bm25],\n", + "# topics,\n", + "# qrels,\n", + "# eval_metrics=['P_1000', 'map', 'recip_rank'],\n", + "# names=['BM25'],\n", + "# baseline=0\n", + "# )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "# Display BM25 results\n", - "print(\"BM25 Experiment Results\")\n", - "print(bm25_experiment)" + "# # Display BM25 results\n", + "# print(\"BM25 Experiment Results\")\n", + "# print(bm25_experiment)" ] }, { @@ -258,30 +336,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ - "# Local test of TF-IDF\n", - "tfidf_experiment = pt.Experiment(\n", - " [tfidf],\n", - " topics,\n", - " qrels,\n", - " eval_metrics=['P_1000', 'map', 'recip_rank'],\n", - " names=['TF-IDF'],\n", - " baseline=0\n", - ")" + "# # Local test of TF-IDF\n", + "# tfidf_experiment = pt.Experiment(\n", + "# [tfidf],\n", + "# topics,\n", + "# qrels,\n", + "# eval_metrics=['P_1000', 'map', 'recip_rank'],\n", + "# names=['TF-IDF'],\n", + "# baseline=0\n", + "# )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ - "# Display TF-IDF results\n", - "print(\"TF-IDF Experiment Results\")\n", - "print(tfidf_experiment)" + "# # Display TF-IDF results\n", + "# print(\"TF-IDF Experiment Results\")\n", + "# print(tfidf_experiment)" ] } ],