From c992e3a1a32d70e0016fc3c8014547e473e8a121 Mon Sep 17 00:00:00 2001 From: simeonwetzel Date: Thu, 8 Aug 2024 11:57:58 +0200 Subject: [PATCH] Added the possibility to exclude datasets that should not be indexed --- search-app/server/config/config.json | 10 +++++++- search-app/server/connectors/geojson_osm.py | 2 +- .../server/connectors/pygeoapi_retriever.py | 23 ++++++++++++++----- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/search-app/server/config/config.json b/search-app/server/config/config.json index 95ba071..eb03072 100644 --- a/search-app/server/config/config.json +++ b/search-app/server/config/config.json @@ -3,9 +3,17 @@ "TAVILY_API_KEY": "", "SDSA_API_KEY": "demo-api-key", "pygeoapi_instances": [ + {"url": "https://api.weather.gc.ca/", + "exclude_collections": [ + "https://api.weather.gc.ca/collections/climate:dcs:historical:seasonal:absolute?lang=en", + "https://api.weather.gc.ca/collections/climate:dcs:projected:monthly:absolute?lang=en" + ] + } + ], + "web_geojson_resources": [ "https://webais.demo.52north.org/pygeoapi" ], - "local_files": [ + "local_geojson_files": [ "./data/" ] } \ No newline at end of file diff --git a/search-app/server/connectors/geojson_osm.py b/search-app/server/connectors/geojson_osm.py index 1799c68..4f9d7eb 100644 --- a/search-app/server/connectors/geojson_osm.py +++ b/search-app/server/connectors/geojson_osm.py @@ -70,7 +70,7 @@ def __init__(self, file_dir: str = None, tag_name: str = "building"): self.features = self._filter_meaningful_features(gj, tag_name) else: if not file_dir: - file_dir = config.local_files + file_dir = config.local_geojson_files logging.info(f"Looking for files in following dir: {file_dir[0]}") gj_files = [] for file in glob.glob(f"{file_dir[0]}*.geojson"): diff --git a/search-app/server/connectors/pygeoapi_retriever.py b/search-app/server/connectors/pygeoapi_retriever.py index 9c3a96a..c81e138 100644 --- a/search-app/server/connectors/pygeoapi_retriever.py +++ 
b/search-app/server/connectors/pygeoapi_retriever.py @@ -4,6 +4,7 @@ from typing import List, Dict from langchain.schema import Document from config.config import Config +import re logging.basicConfig() logging.getLogger().setLevel(logging.INFO) @@ -15,7 +16,7 @@ def __init__(self, urls: List[str] = None): if urls: self.urls = urls else: - self.urls = config.pygeoapi_instances + self.instances = config.pygeoapi_instances async def _get_queryables(self, session: aiohttp.ClientSession, collection_id: str, base_url: str) -> dict: """ @@ -32,16 +33,26 @@ async def _get_queryables(self, session: aiohttp.ClientSession, collection_id: s return queryables['properties'] return {} - async def _get_collections(self, base_url: str) -> List[dict]: + async def _get_collections(self, instance) -> List[dict]: """ Get all collections of a pygeoapi instance asynchronously. """ + base_url = instance["url"] logging.info(f"Fetching collections of pygeoapi instance: {base_url}") + exclude_urls = instance["exclude_collections"] + pattern = r'collections/([^?]+)' + exclude_collections = list(map(lambda url: re.search(pattern, url).group(1), exclude_urls)) + + logging.info(f"Excluding following collections from indexing operation: {exclude_collections}") + + async with aiohttp.ClientSession() as session: async with session.get(f'{base_url}/collections/') as response: if response.status == 200: collections = await response.json() + # excluding collections + collections['collections'] = [coll for coll in collections['collections'] if coll['id'] not in exclude_collections] logging.debug(collections) tasks = [ @@ -76,12 +87,12 @@ def _generate_docs(self, base_url:str, collections: List[dict]) -> List[Document "extent": str(doc["extent"])}) for doc in collections] return docs - async def get_collections_and_generate_docs(self, url) -> Document: - collections = await self._get_collections(url) - docs = self._generate_docs(url, collections) + async def get_collections_and_generate_docs(self, 
instance) -> Document: + collections = await self._get_collections(instance) + docs = self._generate_docs(instance["url"], collections) return docs async def get_docs_for_all_instances(self) -> List[Document]: - tasks = [self.get_collections_and_generate_docs(url) for url in self.urls] + tasks = [self.get_collections_and_generate_docs(instance) for instance in self.instances] all_docs = await asyncio.gather(*tasks) return all_docs[0] \ No newline at end of file