diff --git a/tutorials/tutorial-entity-linking-in-progress.ipynb b/tutorials/tutorial-entity-linking-in-progress.ipynb index 633b482..7ae90f8 100644 --- a/tutorials/tutorial-entity-linking-in-progress.ipynb +++ b/tutorials/tutorial-entity-linking-in-progress.ipynb @@ -1,33 +1,19 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", + "metadata": { + "id": "w8g9eAcFXPPh" + }, "source": [ "# A work in progress notebook for entity linking\n", "\n", "(Submission is currently in progress, looks like we have to lowercase all queries before linking the entities, currently discussing this with Marcel)" - ], - "metadata": { - "id": "w8g9eAcFXPPh" - } + ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -35,148 +21,7 @@ "id": "GCPbVYynSBnZ", "outputId": "4fd50167-8d5e-4064-d750-120902515118" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting python-terrier\n", - " Downloading python-terrier-0.10.0.tar.gz (107 kB)\n", - "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/107.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━\u001b[0m \u001b[32m102.4/107.6 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m107.6/107.6 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Collecting tira\n", - " Downloading tira-0.0.103-py3-none-any.whl (46 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.4/46.4 kB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting ir-datasets\n", - " Downloading ir_datasets-0.5.6-py3-none-any.whl (335 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m335.2/335.2 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.25.2)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.5.3)\n", - "Collecting wget (from python-terrier)\n", - " Downloading wget-3.2.zip (10 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from python-terrier) (4.66.2)\n", - "Collecting pyjnius>=1.4.2 (from python-terrier)\n", - " Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m38.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting matchpy (from python-terrier)\n", - " Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.6/69.6 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.2.2)\n", - "Collecting deprecated (from python-terrier)\n", - " Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)\n", - "Collecting chest (from python-terrier)\n", - " Downloading chest-0.2.3.tar.gz (9.6 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.11.4)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from python-terrier) (2.31.0)\n", - "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from python-terrier) (1.3.2)\n", - "Collecting nptyping==1.4.4 (from python-terrier)\n", - " Downloading nptyping-1.4.4-py3-none-any.whl (31 kB)\n", - "Requirement already satisfied: more_itertools in /usr/local/lib/python3.10/dist-packages (from python-terrier) (10.1.0)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from python-terrier) (3.1.3)\n", - "Requirement already satisfied: statsmodels in /usr/local/lib/python3.10/dist-packages (from python-terrier) (0.14.1)\n", - "Collecting ir_measures>=0.3.1 (from python-terrier)\n", - " Downloading ir_measures-0.3.3.tar.gz (48 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.8/48.8 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Collecting dill (from python-terrier)\n", - " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting pytrec_eval_terrier>=0.5.3 (from python-terrier)\n", - " Downloading pytrec_eval_terrier-0.5.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (287 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m287.4/287.4 kB\u001b[0m \u001b[31m28.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting typish>=1.7.0 (from nptyping==1.4.4->python-terrier)\n", - " Downloading typish-1.9.3-py3-none-any.whl (45 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.1/45.1 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting docker==6.*,>=6.0.0 (from tira)\n", - " Downloading docker-6.1.3-py3-none-any.whl (148 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m148.1/148.1 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: packaging>=14.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (23.2)\n", - "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (2.0.7)\n", - "Requirement already satisfied: websocket-client>=0.32.0 in /usr/local/lib/python3.10/dist-packages (from docker==6.*,>=6.0.0->tira) (1.7.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->python-terrier) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->python-terrier) (3.6)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->python-terrier) (2024.2.2)\n", - "Requirement already satisfied: beautifulsoup4>=4.4.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.12.3)\n", - "Collecting inscriptis>=2.2.0 (from ir-datasets)\n", - " Downloading inscriptis-2.4.0.1-py3-none-any.whl (41 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: lxml>=4.5.2 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (4.9.4)\n", - "Requirement already satisfied: pyyaml>=5.3.1 in /usr/local/lib/python3.10/dist-packages (from ir-datasets) (6.0.1)\n", - "Collecting trec-car-tools>=2.5.4 (from ir-datasets)\n", - " Downloading trec_car_tools-2.6-py3-none-any.whl (8.4 kB)\n", - "Collecting lz4>=3.1.10 (from ir-datasets)\n", - " Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m52.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting warc3-wet>=0.2.3 (from ir-datasets)\n", - " Downloading warc3_wet-0.2.3-py3-none-any.whl (13 kB)\n", - "Collecting warc3-wet-clueweb09>=0.2.5 (from ir-datasets)\n", - " Downloading warc3-wet-clueweb09-0.2.5.tar.gz (17 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Collecting zlib-state>=0.1.3 (from ir-datasets)\n", - " Downloading zlib-state-0.1.6.tar.gz (9.5 kB)\n", - " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", - " Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n", - " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "Collecting ijson>=3.1.3 (from ir-datasets)\n", - " Downloading ijson-3.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (111 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.8/111.8 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting pyautocorpus>=0.1.1 (from ir-datasets)\n", - " Downloading pyautocorpus-0.1.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (379 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m379.9/379.9 kB\u001b[0m \u001b[31m43.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting unlzw3>=0.2.1 (from ir-datasets)\n", - " Downloading unlzw3-0.2.2-py3-none-any.whl (6.1 kB)\n", - "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4>=4.4.1->ir-datasets) (2.5)\n", - "Collecting cwl-eval>=1.0.10 (from ir_measures>=0.3.1->python-terrier)\n", - " Downloading cwl-eval-1.0.12.tar.gz (31 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Collecting cbor>=1.0.0 (from trec-car-tools>=2.5.4->ir-datasets)\n", - " Downloading cbor-1.0.0.tar.gz (20 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Collecting heapdict (from chest->python-terrier)\n", - " Downloading HeapDict-1.0.1-py3-none-any.whl (3.9 kB)\n", - "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.10/dist-packages (from deprecated->python-terrier) (1.14.1)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->python-terrier) (2.1.5)\n", - "Collecting multiset<3.0,>=2.0 (from matchpy->python-terrier)\n", - " Downloading multiset-2.1.1-py2.py3-none-any.whl (8.8 kB)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->python-terrier) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->python-terrier) (2023.4)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->python-terrier) (3.3.0)\n", - "Requirement already satisfied: patsy>=0.5.4 in /usr/local/lib/python3.10/dist-packages (from statsmodels->python-terrier) (0.5.6)\n", - "Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from patsy>=0.5.4->statsmodels->python-terrier) (1.16.0)\n", - "Building wheels for collected packages: python-terrier, ir_measures, warc3-wet-clueweb09, zlib-state, chest, wget, cbor, cwl-eval\n", - " Building wheel for python-terrier (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for python-terrier: filename=python_terrier-0.10.0-py3-none-any.whl size=115532 sha256=50793be4120fd471c5b97dd411e04fecaca0bd6bc445b7077781da0951aaa10d\n", - " Stored in directory: /root/.cache/pip/wheels/79/7c/8f/679a982895c53af35178eceda648a4bc9a9af6af5542e31a0e\n", - " Building wheel for ir_measures (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for ir_measures: filename=ir_measures-0.3.3-py3-none-any.whl size=61182 sha256=fc2159e8a5a993fa86e25d38279d48605b80403802fedb08ca523007f1dea5cf\n", - " Stored in directory: /root/.cache/pip/wheels/9f/0e/22/718279f23fef1673a4c5e433881c25080a6afaa147e007183e\n", - " Building wheel for warc3-wet-clueweb09 (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for warc3-wet-clueweb09: filename=warc3_wet_clueweb09-0.2.5-py3-none-any.whl size=18919 sha256=ca6cd3dab107bc599f7640bb45bab482193c185159e30978ca1ed158a6aa22c2\n", - " Stored in directory: /root/.cache/pip/wheels/1a/d7/91/7ffb991df87e62355d945745035470ba2616aa3d83a250b5f9\n", - " Building wheel for zlib-state (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for zlib-state: filename=zlib_state-0.1.6-cp310-cp310-linux_x86_64.whl size=21163 sha256=4a2c2c56a0ff4226645e6b46f778a32c59eae6e7483d541f71fe67bb0d605ad8\n", - " Stored in directory: /root/.cache/pip/wheels/32/72/7e/aff80f26e926b6e1fb08dfb52aba03c0e058f5e2258deb50a9\n", - " Building wheel for chest (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for chest: filename=chest-0.2.3-py3-none-any.whl size=7612 sha256=72b291ba41b17157da60c20e1910e9386b01d5bb710084bcf12d65f3779a98c3\n", - " Stored in directory: /root/.cache/pip/wheels/88/cf/99/4773b31f855f9ecedc32a0ae400f7a4a3001b37c439b6d1a73\n", - " Building wheel for wget (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=af9e11fe4e7e1706ceb4cd568cc2387ca8b98f188a1293ad3f7b9e2f86030149\n", - " Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769\n", - " Building wheel for cbor (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for cbor: filename=cbor-1.0.0-cp310-cp310-linux_x86_64.whl size=53431 sha256=56d67264a9278d7fa7ec8ec4172bf07bea41fa4f2c98a58550557c1844818330\n", - " Stored in directory: /root/.cache/pip/wheels/85/df/c9/b39e40eccaf76dbd218556639a6dc81562226f4c6a64902c85\n", - " Building wheel for cwl-eval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for cwl-eval: filename=cwl_eval-1.0.12-py3-none-any.whl size=38068 sha256=d38c3a0712cb7d9108554b67a17b806d5765cfbb79e4e44e4ebde6dbd9d4c8af\n", - " Stored in directory: /root/.cache/pip/wheels/3d/c1/94/94a3e5379b1aa8fb7c7f1ad1956305d5edc98ef745b6067d87\n", - "Successfully built python-terrier ir_measures warc3-wet-clueweb09 zlib-state chest wget cbor cwl-eval\n", - "Installing collected packages: wget, warc3-wet-clueweb09, warc3-wet, typish, pyjnius, multiset, ijson, heapdict, cbor, zlib-state, unlzw3, trec-car-tools, pytrec_eval_terrier, pyautocorpus, nptyping, matchpy, lz4, dill, deprecated, cwl-eval, chest, ir_measures, inscriptis, docker, tira, ir-datasets, python-terrier\n", - "Successfully installed cbor-1.0.0 chest-0.2.3 cwl-eval-1.0.12 deprecated-1.2.14 dill-0.3.8 docker-6.1.3 heapdict-1.0.1 ijson-3.2.3 inscriptis-2.4.0.1 ir-datasets-0.5.6 ir_measures-0.3.3 lz4-4.3.3 matchpy-0.5.5 multiset-2.1.1 nptyping-1.4.4 pyautocorpus-0.1.12 pyjnius-1.6.1 python-terrier-0.10.0 pytrec_eval_terrier-0.5.6 tira-0.0.103 trec-car-tools-2.6 typish-1.9.3 unlzw3-0.2.2 warc3-wet-0.2.3 warc3-wet-clueweb09-0.2.5 wget-3.2 zlib-state-0.1.6\n" - ] - } - ], + "outputs": [], "source": [ "# Only needed in Colab, in codespaces everything is already installed.\n", "!pip3 install python-terrier tira ir-datasets" @@ -184,6 +29,15 @@ }, { "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kP6nwrlJSSUw", + "outputId": "14b6a45d-5b30-4a74-eb1a-ef60a47294e6" + }, + "outputs": [], "source": [ "import pyterrier as pt\n", "\n", @@ -192,32 +46,11 @@ "\n", "from tira.rest_api_client import Client\n", "tira = Client()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kP6nwrlJSSUw", - "outputId": "14b6a45d-5b30-4a74-eb1a-ef60a47294e6" - }, - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8\n", - "\n" - ] - } ] }, { "cell_type": "code", - "source": [ - "dataset = pt.get_dataset(\"irds:disks45/nocr/trec-robust-2004\")\n", - "topics = dataset.get_topics(variant='title')\n" - ], + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -226,287 +59,15 @@ "id": "n8keQrBMVUR_", "outputId": "c987dcbb-84fa-4fed-b78c-0ee9a911d302" }, - "execution_count": 4, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " qid query\n", - "0 301 international organized crime\n", - "1 302 poliomyelitis and post polio\n", - "2 303 hubble telescope achievements" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
qidquery
0301international organized crime
1302poliomyelitis and post polio
2303hubble telescope achievements
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "topics", - "summary": "{\n \"name\": \"topics\",\n \"rows\": 250,\n \"fields\": [\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 250,\n \"samples\": [\n \"443\",\n \"307\",\n \"398\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 249,\n \"samples\": [\n \"inventions scientific discoveries\",\n \"new hydroelectric projects\",\n \"dismantling europe s arsenal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } - }, - "metadata": {}, - "execution_count": 4 - } + "outputs": [], + "source": [ + "dataset = pt.get_dataset(\"irds:disks45/nocr/trec-robust-2004\")\n", + "topics = dataset.get_topics(variant='title')\n" ] }, { "cell_type": "code", - "source": [ - "query_entity_linking = tira.pt.transform_queries('ir-benchmarks/marcel-gohsen/courtly-vision', dataset)\n", - "query_entity_linking(topics)" - ], + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -515,57 +76,11 @@ "id": "tqOFa0PJVcvW", "outputId": "83838d44-ed55-4bf4-eeff-d2244b639652" }, - "execution_count": 6, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " qid query \\\n", - "0 301 International Organized Crime \n", - "1 302 Poliomyelitis and Post-Polio \n", - "2 303 Hubble Telescope Achievements \n", - "3 304 Endangered Species (Mammals) \n", - "4 305 Most Dangerous Vehicles \n", - ".. ... ... \n", - "245 696 safety plastic surgery \n", - "246 697 air traffic controller \n", - "247 698 literacy rates Africa \n", - "248 699 term limits \n", - "249 700 gasoline tax U.S. \n", - "\n", - " original_query \\\n", - "0 {'query_id': '301', 'title': 'International Or... \n", - "1 {'query_id': '302', 'title': 'Poliomyelitis an... \n", - "2 {'query_id': '303', 'title': 'Hubble Telescope... \n", - "3 {'query_id': '304', 'title': 'Endangered Speci... \n", - "4 {'query_id': '305', 'title': 'Most Dangerous V... \n", - ".. ... \n", - "245 {'query_id': '696', 'title': 'safety plastic s... \n", - "246 {'query_id': '697', 'title': 'air traffic cont... \n", - "247 {'query_id': '698', 'title': 'literacy rates A... \n", - "248 {'query_id': '699', 'title': 'term limits', 'd... \n", - "249 {'query_id': '700', 'title': 'gasoline tax U.S... \n", - "\n", - " entities \n", - "0 [] \n", - "1 [] \n", - "2 [] \n", - "3 [] \n", - "4 [] \n", - ".. ... \n", - "245 [{'begin': 7, 'end': 22, 'mention': 'plastic s... \n", - "246 [{'begin': 0, 'end': 22, 'mention': 'air traff... \n", - "247 [{'begin': 0, 'end': 8, 'mention': 'literacy',... \n", - "248 [{'begin': 0, 'end': 4, 'mention': 'term', 'ur... \n", - "249 [{'begin': 0, 'end': 8, 'mention': 'gasoline',... \n", - "\n", - "[250 rows x 4 columns]" - ], "text/html": [ - "\n", - "
\n", - "
\n", + "
\n", "\n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", + " entities \n", + "0 [] \n", + "1 [] \n", + "2 [] \n", + "3 [] \n", + "4 [] \n", + ".. ... \n", + "245 [{'begin': 7, 'end': 22, 'mention': 'plastic s... \n", + "246 [{'begin': 0, 'end': 22, 'mention': 'air traff... \n", + "247 [{'begin': 0, 'end': 8, 'mention': 'literacy',... \n", + "248 [{'begin': 0, 'end': 4, 'mention': 'term', 'ur... \n", + "249 [{'begin': 0, 'end': 8, 'mention': 'gasoline',... \n", "\n", - " \n", - "
\n", - "
\n", - "
\n" - ], - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "dataframe", - "variable_name": "topics", - "summary": "{\n \"name\": \"topics\",\n \"rows\": 250,\n \"fields\": [\n {\n \"column\": \"qid\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 250,\n \"samples\": [\n \"443\",\n \"307\",\n \"398\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"query\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 250,\n \"samples\": [\n \"U.S., investment, Africa\",\n \"New Hydroelectric Projects\",\n \"dismantling Europe's arsenal\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"original_query\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"entities\",\n \"properties\": {\n \"dtype\": \"object\",\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" - } + "[250 rows x 4 columns]" + ] }, + "execution_count": 4, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } + ], + "source": [ + "query_entity_linking = tira.pt.transform_queries('ir-benchmarks/marcel-gohsen/entity-linking', dataset)\n", + "query_entity_linking(topics)" ] }, { "cell_type": "code", - "source": [ - "query_entity_linking(topics).iloc[247].to_dict()" - ], + "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -903,10 +250,8 @@ "id": "08x9c7tjWTAe", "outputId": "10092391-6a28-4ecd-fe76-c9120e15af80" }, - "execution_count": 9, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'qid': '698',\n", @@ -932,16 +277,18 @@ " 'score': 0.012711864406779001}]}" ] }, + "execution_count": 5, "metadata": {}, - "execution_count": 9 + "output_type": "execute_result" } + ], + "source": [ + "query_entity_linking(topics).iloc[247].to_dict()" ] }, { "cell_type": "code", - "source": [ - "query_entity_linking(topics).iloc[249].to_dict()" - ], + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -949,10 +296,8 @@ "id": "KubJXyAsV8cX", "outputId": "03ddd508-d0e2-4019-a0c8-71bad902f694" }, - "execution_count": 8, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'qid': '700',\n", @@ -1018,10 +363,320 @@ " 'score': 0.0003222687721559781}]}" ] }, + "execution_count": 6, "metadata": {}, - "execution_count": 8 + "output_type": "execute_result" } + ], + "source": [ + "query_entity_linking(topics).iloc[249].to_dict()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Query Interpretation" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download from the Incubator: https://files.webis.de/data-in-production/data-research/tira-zenodo-dump-preparation/query-processors-in-progress/marcel-gohsen-query-interpretation-trec-core.zip\n", + "\tThis is only used for last spot checks before archival to Zenodo.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Download: 100%|██████████| 98.5k/98.5k [00:00<00:00, 1.92MiB/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Download finished. Extract...\n", + "Extraction finished: /root/.tira/extracted_runs/ir-benchmarks/disks45-nocr-trec-robust-2004-20230209-training/marcel-gohsen\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qidqueryoriginal_queryinterpretations
0301International Organized Crime{'query_id': '301', 'title': 'International Or...[{'id': 0, 'interpretation': ['international',...
1302Poliomyelitis and Post-Polio{'query_id': '302', 'title': 'Poliomyelitis an...[{'id': 0, 'interpretation': ['https://en.wiki...
2303Hubble Telescope Achievements{'query_id': '303', 'title': 'Hubble Telescope...[{'id': 0, 'interpretation': ['https://en.wiki...
3304Endangered Species (Mammals){'query_id': '304', 'title': 'Endangered Speci...[{'id': 0, 'interpretation': ['https://en.wiki...
4305Most Dangerous Vehicles{'query_id': '305', 'title': 'Most Dangerous V...[{'id': 0, 'interpretation': ['most dangerous'...
...............
245696safety plastic surgery{'query_id': '696', 'title': 'safety plastic s...[{'id': 0, 'interpretation': ['safety', 'https...
246697air traffic controller{'query_id': '697', 'title': 'air traffic cont...[{'id': 0, 'interpretation': ['https://en.wiki...
247698literacy rates Africa{'query_id': '698', 'title': 'literacy rates A...[{'id': 0, 'interpretation': ['literacy rates'...
248699term limits{'query_id': '699', 'title': 'term limits', 'd...[{'id': 0, 'interpretation': ['https://en.wiki...
249700gasoline tax U.S.{'query_id': '700', 'title': 'gasoline tax U.S...[{'id': 0, 'interpretation': ['https://en.wiki...
\n", + "

250 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " qid query \\\n", + "0 301 International Organized Crime \n", + "1 302 Poliomyelitis and Post-Polio \n", + "2 303 Hubble Telescope Achievements \n", + "3 304 Endangered Species (Mammals) \n", + "4 305 Most Dangerous Vehicles \n", + ".. ... ... \n", + "245 696 safety plastic surgery \n", + "246 697 air traffic controller \n", + "247 698 literacy rates Africa \n", + "248 699 term limits \n", + "249 700 gasoline tax U.S. \n", + "\n", + " original_query \\\n", + "0 {'query_id': '301', 'title': 'International Or... \n", + "1 {'query_id': '302', 'title': 'Poliomyelitis an... \n", + "2 {'query_id': '303', 'title': 'Hubble Telescope... \n", + "3 {'query_id': '304', 'title': 'Endangered Speci... \n", + "4 {'query_id': '305', 'title': 'Most Dangerous V... \n", + ".. ... \n", + "245 {'query_id': '696', 'title': 'safety plastic s... \n", + "246 {'query_id': '697', 'title': 'air traffic cont... \n", + "247 {'query_id': '698', 'title': 'literacy rates A... \n", + "248 {'query_id': '699', 'title': 'term limits', 'd... \n", + "249 {'query_id': '700', 'title': 'gasoline tax U.S... \n", + "\n", + " interpretations \n", + "0 [{'id': 0, 'interpretation': ['international',... \n", + "1 [{'id': 0, 'interpretation': ['https://en.wiki... \n", + "2 [{'id': 0, 'interpretation': ['https://en.wiki... \n", + "3 [{'id': 0, 'interpretation': ['https://en.wiki... \n", + "4 [{'id': 0, 'interpretation': ['most dangerous'... \n", + ".. ... \n", + "245 [{'id': 0, 'interpretation': ['safety', 'https... \n", + "246 [{'id': 0, 'interpretation': ['https://en.wiki... \n", + "247 [{'id': 0, 'interpretation': ['literacy rates'... \n", + "248 [{'id': 0, 'interpretation': ['https://en.wiki... \n", + "249 [{'id': 0, 'interpretation': ['https://en.wiki... \n", + "\n", + "[250 rows x 4 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query_interpretation = tira.pt.transform_queries('ir-benchmarks/marcel-gohsen/query-interpretation', dataset)\n", + "query_interpretation(topics)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'qid': '698',\n", + " 'query': 'literacy rates Africa',\n", + " 'original_query': {'query_id': '698',\n", + " 'title': 'literacy rates Africa',\n", + " 'description': 'What are literacy rates in African countries?',\n", + " 'narrative': 'A relevant document will contain information about the\\nliteracy rate in an African country.\\nGeneral education levels that do not specifically include literacy rates\\nare not relevant.'},\n", + " 'interpretations': [{'id': 0,\n", + " 'interpretation': ['literacy rates',\n", + " 'https://en.wikipedia.org/wiki/Africa'],\n", + " 'relevance': 1.017462925635508,\n", + " 'containedEntities': ['https://en.wikipedia.org/wiki/Africa'],\n", + " 'contextWords': ['literacy', 'rates'],\n", + " 'score': 1.017462925635508},\n", + " {'id': 0,\n", + " 'interpretation': ['https://en.wikipedia.org/wiki/List_of_countries_by_literacy_rate',\n", + " 'https://en.wikipedia.org/wiki/Africa'],\n", + " 'relevance': 0.710923257536899,\n", + " 'containedEntities': ['https://en.wikipedia.org/wiki/List_of_countries_by_literacy_rate',\n", + " 'https://en.wikipedia.org/wiki/Africa'],\n", + " 'contextWords': [],\n", + " 'score': 0.710923257536899},\n", + " {'id': 0,\n", + " 'interpretation': ['https://en.wikipedia.org/wiki/List_of_countries_by_literacy_rate',\n", + " 'africa'],\n", + " 'relevance': 0.6129202512020191,\n", + " 'containedEntities': ['https://en.wikipedia.org/wiki/List_of_countries_by_literacy_rate'],\n", + " 'contextWords': ['africa'],\n", + " 'score': 0.6129202512020191}]}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query_interpretation(topics).iloc[247].to_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'qid': '700',\n", + " 'query': 'gasoline tax U.S.',\n", + " 'original_query': {'query_id': '700',\n", + " 'title': 'gasoline tax U.S.',\n", + " 'description': 'What are the arguments for and against an increase in gasoline\\ntaxes in the U.S.?',\n", + " 'narrative': 'Relevant documents present reasons for or against raising gasoline taxes\\nin the U.S. Documents discussing rises or decreases in the price of\\ngasoline are not relevant.'},\n", + " 'interpretations': [{'id': 0,\n", + " 'interpretation': ['https://en.wikipedia.org/wiki/Fuel_tax', 'u.s.'],\n", + " 'relevance': 0.106382978723404,\n", + " 'containedEntities': ['https://en.wikipedia.org/wiki/Fuel_tax'],\n", + " 'contextWords': ['u.s.'],\n", + " 'score': 0.106382978723404}]}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query_interpretation(topics).iloc[249].to_dict()" ] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}