From 4a98e1ab804548464fda439298d7021188ca9f0a Mon Sep 17 00:00:00 2001 From: Renato Durrer Date: Mon, 8 Apr 2024 21:28:00 +0200 Subject: [PATCH] EDA on new set of borehole profiles. --- environment-dev.yml | 2 + .../Borehole_Profile_V2_Exploration.ipynb | 926 ++++++++++++++++++ 2 files changed, 928 insertions(+) create mode 100644 notebooks/Borehole_Profile_V2_Exploration.ipynb diff --git a/environment-dev.yml b/environment-dev.yml index 1d5acb5e..29edd954 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -20,6 +20,8 @@ dependencies: - black==24.2.0 - pre-commit==3.6.2 - pytest==8.1.1 + - langdetect==1.0.9 + - tqdm==4.66.2 - pip: # prod pip dependencies; needs to be a strict copy of environment-prod.yml - amazon-textract-textractor diff --git a/notebooks/Borehole_Profile_V2_Exploration.ipynb b/notebooks/Borehole_Profile_V2_Exploration.ipynb new file mode 100644 index 00000000..16bee109 --- /dev/null +++ b/notebooks/Borehole_Profile_V2_Exploration.ipynb @@ -0,0 +1,926 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9f3aa641", + "metadata": {}, + "source": [ + "# EDA on Second Set of Borehole Profiles" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "eacdbb5e", + "metadata": {}, + "outputs": [], + "source": [ + "from stratigraphy import DATAPATH\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "90e4aab6-f21b-422f-9278-2d3a7715bc4f", + "metadata": {}, + "outputs": [], + "source": [ + "input_directory = DATAPATH / \"data_v2\"\n", + "ground_truth_path = input_directory / \"geoquat_ground_truth.json\"\n", + "predictions_path = input_directory / \"extract\" / \"predictions.json\"" + ] + }, + { + "cell_type": "markdown", + "id": "62ceb4d4", + "metadata": {}, + "source": [ + "## Check if ground truth is present for all files" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "fecb9c65-5fb2-498b-a7c4-439fd2e812b2", + "metadata": {}, + "outputs": [], + "source": [ + "with 
open(ground_truth_path) as in_file:\n", + " ground_truth = json.load(in_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e9be30e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 1658 pdf files in the input directory.\n" + ] + } + ], + "source": [ + "# count all pdf files in input directory\n", + "pdf_files = list(input_directory.glob(\"*.pdf\"))\n", + "n_pdf_files = len(pdf_files)\n", + "print(f\"There are {n_pdf_files} pdf files in the input directory.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0882c4a9-e1b3-465e-8da5-42547b61dfeb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There is ground_truth data for 2137 pdf files. There are no duplicated keys.\n" + ] + } + ], + "source": [ + "# NOTE(review): json.load keeps only the last value of a repeated key, so this count\n", + "# cannot by itself confirm the no-duplicates statement printed below - verify separately.\n", + "print(f\"There is ground_truth data for {len(ground_truth.keys())} pdf files. There are no duplicated keys.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b0fb5110", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 1658 pdf files with ground truth data.\n" + ] + } + ], + "source": [ + "# Check the overlap between ground truth and available pdfs\n", + "ground_truth_keys = set(ground_truth.keys())\n", + "# Path.name yields the bare file name on every OS; splitting str(path) on '/' breaks on Windows.\n", + "pdf_keys = {pdf_path.name for pdf_path in pdf_files}\n", + "intersection = ground_truth_keys.intersection(pdf_keys)\n", + "print(f\"There are {len(intersection)} pdf files with ground truth data.\")" + ] + }, + { + "cell_type": "markdown", + "id": "350b2eec", + "metadata": {}, + "source": [ + "Every pdf in the input directory has ground truth data. Moreover, there is also ground truth for pdfs that we don't have."
+ ] + }, + { + "cell_type": "markdown", + "id": "55ee4626", + "metadata": {}, + "source": [ + "## Check language distribution of the borehole profiles" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "697a4493", + "metadata": {}, + "outputs": [], + "source": [ + "from langdetect import DetectorFactory, detect\n", + "import fitz\n", + "from tqdm import tqdm\n", + "import matplotlib.pyplot as plt\n", + "from collections import defaultdict\n", + "\n", + "# langdetect is non-deterministic unless seeded; fix the seed so the per-language file lists are reproducible\n", + "DetectorFactory.seed = 0" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6c06ec7f", + "metadata": {}, + "outputs": [], + "source": [ + "def detect_language(text):\n", + "    \"\"\"Return the language code that langdetect reports for text, or a sentinel string if detection raises.\"\"\"\n", + "    try:\n", + "        return detect(text)\n", + "    except Exception:  # unlike a bare except, lets KeyboardInterrupt/SystemExit propagate\n", + "        return \"Language detection failed\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e85fca47", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_text_from_pdf(file_path):\n", + "    \"\"\"Return the text of every page of the PDF at file_path, concatenated in page order.\"\"\"\n", + "    # close the document when done instead of leaking one open handle per file\n", + "    with fitz.open(file_path) as doc:\n", + "        return \"\".join(page.get_text() for page in doc)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f552de08", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1658 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot language distribution\n", + "plt.bar(languages.keys(), languages.values())\n", + "plt.xlabel(\"Language\")\n", + "plt.ylabel(\"Number of PDF files\")\n", + "plt.title(\"Language distribution in PDF files\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "e5959dcd", + "metadata": {}, + "source": [ + "### The language detection is sometimes wrong - Examples\n", + "In some cases the wrong language is detected. This seems to be caused by the text recognition (OCR) rather than by the language detection itself.\n", + "Sometimes text is very badly recognized, and sometimes there is text in the PDFs that is not supposed to be there."
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6fa25548", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Geobechnisches Institute RG Rotationsbohrung RB P Ben Gortenstrouse a Selection on M RtNe Objett Geverbezentrum Melliger Chutzenstrasse Hone in in M Stung tohn AS Koordination Bohrneight Mr Moser He Florl Deturn LN Tete Profit Georechrische Bezeichnung Geal Benerlungen silger Kare Send with I King IN and Sand AT partner snipsitiger Send AT againstre in braungrau Kee Bent AS a may sends new ship offer Tax Send Mino Grobanc veric Kor and sitigen Lines Fair Grobeard as site signature when order wardland leader nitworig Feinward pebtrace an onlession and since Fairward brown graver Persend probrain audit differe Painsant at Personal one press leught Waterlapering and Send to Assistic grauget braun leaft as Fairmand terling Math sitigar Sand Mississed teache sition Sand work Kim achweek I Keep nb fined sites transport Kar and Sand at sitip Staine SPT be be an due de Bellage a b c SPT d d '" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Some examples have strange text in them. Example of A8570.pdf. Document is in German, but there seems to be a lot of\n", + "# English text in it. 
The actual document does not contain this English text\n", + "text = extract_text_from_pdf(files_per_language['en'][-2]).replace(\"\\n\", \" \")\n", + "''.join(e for e in text if (e.isalnum() or e.isspace()) and not e.isdigit())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c1f8d4b3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Rb Didnt auffullery Tart unit Sandand filtsinlogegr land filt unit ind Sand u weing this stell waise samber fand fitty fand unit kies silt unit Sand and weing kies '" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example A8504 --> almost no text recognized. The document is actually in German\n", + "text = extract_text_from_pdf(files_per_language['en'][-5]).replace(\"\\n\", \" \")\n", + "''.join(e for e in text if (e.isalnum() or e.isspace()) and not e.isdigit())" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "70a4747f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'which getter send Author V BI '" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example A8530 --> OCR output is full of errors. 
The document is actually in German\n", + "text = extract_text_from_pdf(files_per_language['en'][-7]).replace(\"\\n\", \" \")\n", + "''.join(e for e in text if (e.isalnum() or e.isspace()) and not e.isdigit())" + ] + }, + { + "cell_type": "markdown", + "id": "c791c92e", + "metadata": {}, + "source": [ + "## Check Content of Ground Truth" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "10ed5f38", + "metadata": {}, + "outputs": [], + "source": [ + "# get 10 random keys from ground truth\n", + "import random\n", + "random_keys = random.sample(list(intersection), 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6df98c9e", + "metadata": {}, + "outputs": [], + "source": [ + "# fix the keys\n", + "random_keys = ['A11145.pdf',\n", + " 'A11132.pdf',\n", + " 'A11251.pdf',\n", + " 'B598.pdf',\n", + " 'A11440.pdf',\n", + " 'A11329.pdf',\n", + " '12326.pdf',\n", + " '8367.pdf',\n", + " 'A7171.pdf',\n", + " 'A8291.pdf']" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4670b639", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Humus, dunkelbraun',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.5}},\n", + " {'material_description': 'Silt, Sand + Grobkies, braun',\n", + " 'depth_interval': {'start': 0.5, 'end': 1.0}},\n", + " {'material_description': 'siltiger Sand, braun',\n", + " 'depth_interval': {'start': 1.0, 'end': 2.1}},\n", + " {'material_description': 'sandiger Silt, organ. 
Verunreinigt, graugelb',\n", + " 'depth_interval': {'start': 2.1, 'end': 2.3}},\n", + " {'material_description': 'Torf + Silt, schwarz bis grau',\n", + " 'depth_interval': {'start': 2.3, 'end': 2.8}},\n", + " {'material_description': 'Silt mit Feinsand, graugelb',\n", + " 'depth_interval': {'start': 2.8, 'end': 3.2}},\n", + " {'material_description': 'Siltiger Sand, grau bis dunkelgrau',\n", + " 'depth_interval': {'start': 3.2, 'end': 3.9}},\n", + " {'material_description': 'siltiger Sand mit Kies, bräunlich-grau',\n", + " 'depth_interval': {'start': 3.9, 'end': 4.35}},\n", + " {'material_description': 'siltiger Kiessand, rostrot',\n", + " 'depth_interval': {'start': 4.35, 'end': 5.1}},\n", + " {'material_description': 'Kiessand mit einzelne siltigen Zwischenlagen, grau',\n", + " 'depth_interval': {'start': 5.1, 'end': 8.0}},\n", + " {'material_description': 'toniger Silt (Seeton), dunkelgrau',\n", + " 'depth_interval': {'start': 8.0, 'end': 16.2}}]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "number_wrong = 0\n", + "fp = 0\n", + "fn = 0\n", + "tp = 0\n", + "total_number = 0\n", + "total_number += len(ground_truth[random_keys[0]][\"layers\"])\n", + "tp += len(ground_truth[random_keys[0]][\"layers\"])\n", + "ground_truth[random_keys[0]][\"layers\"] # all correct\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "72e28be2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'toniger Silt mit wenig Feinsand, bräunlichgrau',\n", + " 'depth_interval': {'start': 0.0, 'end': 2.75}},\n", + " {'material_description': 'leicht toniger Silt ab 5,80 bis 15,0 toniger Silt, dunkelgrau',\n", + " 'depth_interval': {'start': 2.75, 'end': 15.0}}]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fn += 1\n", + "tp += len(ground_truth[random_keys[1]][\"layers\"])\n", + "total_number += 
len(ground_truth[random_keys[1]][\"layers\"]) + 1\n", + "ground_truth[random_keys[1]][\"layers\"] # first layer is missing" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "817e7692", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Humus, schwarz',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.5}},\n", + " {'material_description': 'Silt mit viel Sand und wenig Kies, sowie Ziegelresten etc. Auffüllung, braun',\n", + " 'depth_interval': {'start': 0.5, 'end': 1.7}},\n", + " {'material_description': 'leicht siltiger Kies mit reichlich Sand und mit Steinen bis ¿15cm, braungrau',\n", + " 'depth_interval': {'start': 1.7, 'end': 4.5}},\n", + " {'material_description': 'toniger Silt mit wenig kleinen Steinen, braun grau',\n", + " 'depth_interval': {'start': 4.5, 'end': 6.0}},\n", + " {'material_description': 'toniger Silt mit sehr wenig kleinen Steinen, blau grau',\n", + " 'depth_interval': {'start': 6.0, 'end': 15.3}}]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tp += len(ground_truth[random_keys[2]][\"layers\"])\n", + "total_number += len(ground_truth[random_keys[2]][\"layers\"])\n", + "ground_truth[random_keys[2]][\"layers\"] # all correct" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ca0578cc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Humus',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.6}},\n", + " {'material_description': 'Brauner, siltiger Sand, erdig',\n", + " 'depth_interval': {'start': 0.6, 'end': 1.7}},\n", + " {'material_description': 'Sand, Kies, leicht erdig',\n", + " 'depth_interval': {'start': 1.7, 'end': 2.0}},\n", + " {'material_description': 'Stark sandiger Kies, mit einz. Steinen',\n", + " 'depth_interval': {'start': 2.0, 'end': 4.0}},\n", + " {'material_description': 'Kies, einz. 
Steine mit wechselned wenig bis reichlich Sand',\n", + " 'depth_interval': {'start': 4.0, 'end': 7.5}},\n", + " {'material_description': 'Schuttmaterial? Kies und Steine, sandig, z.T. kantiges Material, erdig-braun',\n", + " 'depth_interval': {'start': 7.5, 'end': 8.5}},\n", + " {'material_description': 'Mergelkalk, grau, gebankt, Mergelzwischenlagen bis wenige cm. Stärke',\n", + " 'depth_interval': {'start': 8.5, 'end': 13.6}}]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp += 2\n", + "total_number += len(ground_truth[random_keys[3]][\"layers\"])\n", + "# count only this file's layers: total_number is the running total over all files reviewed so far\n", + "tp += len(ground_truth[random_keys[3]][\"layers\"]) - 2\n", + "ground_truth[random_keys[3]][\"layers\"] # two layers deviate in wording" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "cf7656f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Humus mit Steinen, braun',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.4}},\n", + " {'material_description': 'Stark siltiger Kies mit viel Sand und Wurzeln humos wenig Steinen bis 20cm, braun',\n", + " 'depth_interval': {'start': 0.4, 'end': 1.8}},\n", + " {'material_description': 'leicht siltiger Kies ca. 20-30% grosse Steine bis 20cm eher locker, grau',\n", + " 'depth_interval': {'start': 1.8, 'end': 2.4}},\n", + " {'material_description': 'stark siltiger Sand mit viel Kies, z.T. 
leicht tonig, festgelagert, grau-gelblich',\n", + " 'depth_interval': {'start': 2.4, 'end': 3.5}},\n", + " {'material_description': 'stark toniger Silt, hartplastisch, gelb',\n", + " 'depth_interval': {'start': 3.5, 'end': 3.65}},\n", + " {'material_description': 'tonig siltiger Kies mit viel Sand, wenig grosse Steine bis 15cm, festgelagert, grau',\n", + " 'depth_interval': {'start': 3.65, 'end': 5.0}},\n", + " {'material_description': 'Wechsellagerungen von siltigen Fein-Mittelsand',\n", + " 'depth_interval': {'start': 5.0, 'end': 5.8}}]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tp += len(ground_truth[random_keys[4]][\"layers\"])\n", + "total_number += len(ground_truth[random_keys[4]][\"layers\"])\n", + "ground_truth[random_keys[4]][\"layers\"] # wording slightly different (diameter symbol exchanged with \"bis\")\n", + "# we consider this as correct though" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "88826b70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Humus',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.1}},\n", + " {'material_description': 'brauner, stark siltiger Kies mit Sand',\n", + " 'depth_interval': {'start': 0.1, 'end': 0.5}},\n", + " {'material_description': 'grauer, leicht siltiger Kies mit reichlich bis viel Sand',\n", + " 'depth_interval': {'start': 0.5, 'end': 3.45}},\n", + " {'material_description': 'beiger, stark siltiger Kies mit Sand, wasserführend',\n", + " 'depth_interval': {'start': 3.45, 'end': 3.7}},\n", + " {'material_description': 'beiger, toniger Silt mit einzelnen kleinen Steinen, plastisch',\n", + " 'depth_interval': {'start': 3.7, 'end': 4.7}},\n", + " {'material_description': 'angewitterter Sandstein bei 5m Wasserführend',\n", + " 'depth_interval': {'start': 4.7, 'end': 5.4}},\n", + " {'material_description': 'kompakter Sandstein',\n", + " 'depth_interval': {'start': 5.4, 'end': 
6.0}}]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tp += len(ground_truth[random_keys[5]][\"layers\"])\n", + "total_number += len(ground_truth[random_keys[5]][\"layers\"])\n", + "ground_truth[random_keys[5]][\"layers\"] # all correct" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "6a1d3ec5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Enrobé (e= 10 cm) sur grave de fondation assez limoneuse.',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.8}},\n", + " {'material_description': 'Limon sablo-argileux, gris-beige, ferme, à cailloux plus ou moins abondants et morceaux de briques.',\n", + " 'depth_interval': {'start': 0.8, 'end': 1.5}},\n", + " {'material_description': 'Gravier à éléments roulés hétérométriques (dm. max. obs. 10 cm) dans matrice sableuse plus ou moins limoneuse, gris-beige, pulvérulente à assez cohérente dans les zones les plus limoneuses. Assise compacte à très compacte, perméable à semi-perméable.',\n", + " 'depth_interval': {'start': 1.5, 'end': 3.4}},\n", + " {'material_description': \"Gravier à éléments roulés hétérométriques (dm. max. obs. 10 cm) dans matrice très limoneuse semi-cohérente et peu perméable. Assise compacte à très compacte, perméable à semi-perméable.\\nPetit venue d'eau vers 4.0m (11-12.2.2004).\",\n", + " 'depth_interval': {'start': 3.4, 'end': 4.3}},\n", + " {'material_description': 'Limon argileux, gris, à feuillets légèrement oxydés au sommet, irrégulièrement stratifié, souvent à délits de limon parfois finement sableux, ferme, plastique. Quelques cailloux isolés. Assise quasi-imperméable, de faible à moyenne portance.',\n", + " 'depth_interval': {'start': 4.3, 'end': 8.8}},\n", + " {'material_description': \"Argile limoneuse, grise, le plus souvent d'aspect massif, localement grossièrement stratifiée avec du limon argileux, tendre, plastique à très plastique. 
Rares petits cailloux dispersés dans la masse. Assise quasi-imperméable, de faible portance.\",\n", + " 'depth_interval': {'start': 8.8, 'end': 14.0}},\n", + " {'material_description': 'Zone à prédominance limono-argileuse, irrégulièrement stratifiée, tendre à ferme, moins plastique.',\n", + " 'depth_interval': {'start': 14.0, 'end': 15.0}}]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp += 1\n", + "tp += len(ground_truth[random_keys[6]][\"layers\"]) - 1\n", + "total_number += len(ground_truth[random_keys[6]][\"layers\"])\n", + "ground_truth[random_keys[6]][\"layers\"] # fourth layer deviates" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "543479c2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Bitume.',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.1}},\n", + " {'material_description': \"Grave d'infrastructure.\",\n", + " 'depth_interval': {'start': 0.1, 'end': 0.5}},\n", + " {'material_description': 'Limon argileux à poussées finement sableuses, brun-jaunâtre (oxydation), texture grumeleuse à délits pelliculaires de sable fin, ferme, assez plastique.',\n", + " 'depth_interval': {'start': 0.5, 'end': 2.3}},\n", + " {'material_description': \"Argile peu limoneuse, brun verdâtre à rubanage limonitique ocre, mal stratifiée à massive, nombreux petits coquillages d'eau douce et esquilles charbonneuses. 
Assise ferme à tendre, plastique, imperméable.\",\n", + " 'depth_interval': {'start': 2.3, 'end': 2.8}},\n", + " {'material_description': 'Argile peu limoneuse, grise rubanée de noir (traces organiques diffuses), avec débris de mollusques et esquilles de charbon, tendre, très plastique.',\n", + " 'depth_interval': {'start': 2.8, 'end': 3.9}},\n", + " {'material_description': 'Limon sablo-argileux gris-sombre, avec nombreux fragments de plantes palustres plus ou moins lignitisés (voir niveaux à tourbe) et coquillages de mollusques; interstratifié par de belles zones très organiques (tourbeuses). Assise tendre de faible compacité, compressible.\\nNappe phréatique à 5.62 m (3.4.89).',\n", + " 'depth_interval': {'start': 3.9, 'end': 5.7}},\n", + " {'material_description': \"Sable fin limoneux et limons sableux, gris sombre à fragments de plantes palustres plus ou moins lignitisés et coquillages de mollusques; mal stratifié, tendre à mou. Fluent à l'excavation sous l'eau.\",\n", + " 'depth_interval': {'start': 5.7, 'end': 6.3}},\n", + " {'material_description': \"Dépôts de laisse. Limon finement sableux, gris-sombre, assez bien varvé, quelques coquillages et débris organiques. Assise lâche. Fluente sous l'eau.\",\n", + " 'depth_interval': {'start': 6.3, 'end': 6.7}},\n", + " {'material_description': 'Gravier à galets roulés et cailloux hétérométriques (dm. max obs. 4,00 cm) petits éléments dominants dans matrice très limoneuse, cohérent, très dure, gris, pulvérulent avec une très bonne cohésion dans les zones à prédominances limoneuses. Assise peu à moyennement compacte, perméable.',\n", + " 'depth_interval': {'start': 6.7, 'end': 7.2}},\n", + " {'material_description': 'Sable moyen et grossier, pratiquement dépourvu de cailloux, saturé. Assise peu à moyennement compacte, perméable.',\n", + " 'depth_interval': {'start': 7.2, 'end': 8.6}},\n", + " {'material_description': 'Gravier à galets roulés et cailloux hétérométriques (dm. max obs. 
4,00 cm) petits éléments dominants dans matrice de sable fin parfois très limoneux, gris, pulvérulent avec une très bonne cohésion dans les zones à prédominances limoneuses. Assise peu à moyennement compacte, perméable.',\n", + " 'depth_interval': {'start': 8.6, 'end': 9.1}},\n", + " {'material_description': 'Sable fin et moyen avec petits cailloux isolés et blocaux à la base. Assise peu à moyennement compacte, perméable.',\n", + " 'depth_interval': {'start': 9.1, 'end': 9.6}},\n", + " {'material_description': 'Limon argileux finement sableux, gris, quelques cailloux isolés au sommet, bien stratifié, ferme, assez plastique. Assise imperméable.',\n", + " 'depth_interval': {'start': 9.6, 'end': 11.9}},\n", + " {'material_description': \"Argile limoneuse, gris sombre, parfois stratifiée par de belles veines limoneuses et quelques filets de sable fin. Molle à tendre dans l'ensemble, avec niveaux plus argileux très mous, plastique.\",\n", + " 'depth_interval': {'start': 11.9, 'end': 14.1}},\n", + " {'material_description': \"Argile peu ou pas limoneuse, gris sombre, massive. Molle à tendre dans l'ensemble, avec niveaux plus argileux très mous, très plastique.\",\n", + " 'depth_interval': {'start': 14.1, 'end': 15.0}}]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp += 5\n", + "tp += len(ground_truth[random_keys[7]][\"layers\"]) - 5\n", + "total_number += len(ground_truth[random_keys[7]][\"layers\"])\n", + "ground_truth[random_keys[7]][\"layers\"] # sixth layer has an additional sentence in ground truth\n", + "# same for 6.7 - 7.2.\n", + "# Same for 9.1 - 9.6\n", + "# 11.9 - 14.1 text taken from two different areas on the pdf. 
text is joint together not in sequential order\n", + "# 14.1 - 15.0 different wording\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "9a293a1d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'Humus',\n", + " 'depth_interval': {'start': 0.0, 'end': 0.25}},\n", + " {'material_description': 'leicht siltiger, sandiger Kies (rund), grau bis braun',\n", + " 'depth_interval': {'start': 0.25, 'end': 1.3}},\n", + " {'material_description': 'siltiger Fein- bis Mittelsand mit reichlich Kies, braungrau, bis grau',\n", + " 'depth_interval': {'start': 1.3, 'end': 2.8}},\n", + " {'material_description': 'zersetzter Torf, schwarz',\n", + " 'depth_interval': {'start': 2.8, 'end': 3.3}},\n", + " {'material_description': 'Silt, sehr weich, organisch (mit Torflagen), dunkelgrau',\n", + " 'depth_interval': {'start': 3.3, 'end': 3.85}},\n", + " {'material_description': 'ziemlich zersetzter Torf, dunkelbraun',\n", + " 'depth_interval': {'start': 3.85, 'end': 4.45}},\n", + " {'material_description': 'leicht toniger Silt, weich, geschichtet, grau, wenig organische Beimengungen (Holzresten)',\n", + " 'depth_interval': {'start': 4.45, 'end': 5.2}},\n", + " {'material_description': 'sandiger Silt mit wenig bis reichlich Kies, zum Teil kantig, grau (bis 5.7 m aufgeweicht, anschliessend relativ kompakt, moräneartig)',\n", + " 'depth_interval': {'start': 5.2, 'end': 6.6}},\n", + " {'material_description': 'siltiger Fein- bis Mittelsand, wenig Kies, grau',\n", + " 'depth_interval': {'start': 6.6, 'end': 7.1}},\n", + " {'material_description': 'toniger Silt mit Sand, steif, grau (ab 8.0 m mit wenig Kies)',\n", + " 'depth_interval': {'start': 7.1, 'end': 8.5}},\n", + " {'material_description': 'Silt und Sand mit reichlich Kies, zum Teil kantig, grau',\n", + " 'depth_interval': {'start': 8.5, 'end': 9.9}},\n", + " {'material_description': 'tonig-siltiger Sand',\n", + " 'depth_interval': {'start': 9.9, 'end': 10.0}},\n", + " 
{'material_description': 'Silt und Sand mit reichlich Kies, zum Teil kantig, grau',\n", + " 'depth_interval': {'start': 10.0, 'end': 13.0}},\n", + " {'material_description': 'siltiger Sand',\n", + " 'depth_interval': {'start': 13.0, 'end': 13.5}},\n", + " {'material_description': 'Silt und Sand mit reichlich Kies, zum Teil kantig, grau',\n", + " 'depth_interval': {'start': 13.5, 'end': 14.6}},\n", + " {'material_description': 'sauberer bis leicht siltiger Fein- bis Grobsand mit viel Kies, grau',\n", + " 'depth_interval': {'start': 14.6, 'end': 15.3}},\n", + " {'material_description': 'Silt und Sand mit wenig bis reichlich Kies, zum Teil kantig, grau',\n", + " 'depth_interval': {'start': 15.3, 'end': 15.5}}]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fp += 4\n", + "tp += len(ground_truth[random_keys[8]][\"layers\"]) - 4\n", + "total_number += len(ground_truth[random_keys[8]][\"layers\"])\n", + "ground_truth[random_keys[8]][\"layers\"]\n", + "# one layer in the pdf is divided into multiple layers in the ground truth. 
Drawing shows that the layer might actually be divided.\n", + "# 9.9 - 10.0 layer present in ground truth but not in pdf\n", + "# 10.0 - 13\n", + "# 13.0 - 13.5\n", + "# 13.5 - 14.6\n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d686b1b3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'material_description': 'sandiger (MS - VCS) Kies (Durchmesser 2 - 30 mm) grau-braun bis bunt',\n", + " 'depth_interval': {'start': 0.0, 'end': 12.0}},\n", + " {'material_description': 'sauberer Sand (FS-CS) braun-grau',\n", + " 'depth_interval': {'start': 12.0, 'end': 18.0}},\n", + " {'material_description': 'sandiger (FS-VCS) Kies (Durchmesser 2 - 15 mm) bunt',\n", + " 'depth_interval': {'start': 18.0, 'end': 44.0}},\n", + " {'material_description': 'sehr sandiger (FS - CS) Kies (Durchmesser 2 - 20 mm), braun',\n", + " 'depth_interval': {'start': 44.0, 'end': 46.0}},\n", + " {'material_description': 'leicht kiesiger (Durchmesser 2 - 5 mm) Sand (FS - VCS)',\n", + " 'depth_interval': {'start': 46.0, 'end': 48.0}},\n", + " {'material_description': 'leicht sandiger (MS - CS) Kies (Durchmesser 2 - 20 mm) bunt mit relativ starken Wasseraustritten',\n", + " 'depth_interval': {'start': 48.0, 'end': 56.0}},\n", + " {'material_description': 'sauberer Sand (FS - VCS) braun',\n", + " 'depth_interval': {'start': 56.0, 'end': 64.0}},\n", + " {'material_description': 'bunte Mergel grau-braun bis rötlich',\n", + " 'depth_interval': {'start': 64.0, 'end': 68.0}}]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NOTE(review): the notes below list layers 9 - 14 as missing in the ground truth; confirm that fn += 1 (not 6) is intended\n", + "fn += 1\n", + "fp += 6\n", + "# tp + fp must equal the number of ground-truth layers, so only the 6 deviating layers are subtracted (the missing ones are fn, not part of the ground truth)\n", + "tp += len(ground_truth[random_keys[9]][\"layers\"]) - 6\n", + "total_number += len(ground_truth[random_keys[9]][\"layers\"])\n", + "ground_truth[random_keys[9]][\"layers\"]\n", + "# layer 1: words in wrong order\n", + "# layer 4: words in wrong order\n", + "# layer 5: words in wrong order\n", + "# layer 6: words in wrong order, additional 
information in ground truth that is not in material description\n", + "# layer 7: slightly different information\n", + "# layer 8: additional information in ground truth not present in material description\n", + "# layer 9 - 14: missing in ground truth" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "d7a1baac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The total number of layers is 87. The precision is 0.83 and the recall is 0.98. F1 is 0.90.\n" + ] + } + ], + "source": [ + "print(f\"The total number of layers is {total_number}. The precision is {tp / (tp + fp):.2f} and the recall is {tp / (tp + fn):.2f}. F1 is {2 * tp / (2 * tp + fp + fn):.2f}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31917962", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aca08bb7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d696e5e9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}