Skip to content

Commit

Permalink
Added the possibility to exclude datasets that should not be indexed
Browse files Browse the repository at this point in the history
  • Loading branch information
simeonwetzel committed Aug 8, 2024
1 parent d98a57d commit c992e3a
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 8 deletions.
10 changes: 9 additions & 1 deletion search-app/server/config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,17 @@
"TAVILY_API_KEY": "",
"SDSA_API_KEY": "demo-api-key",
"pygeoapi_instances": [
{"url": "https://api.weather.gc.ca/",
"exclude_collections": [
"https://api.weather.gc.ca/collections/climate:dcs:historical:seasonal:absolute?lang=en",
"https://api.weather.gc.ca/collections/climate:dcs:projected:monthly:absolute?lang=en"
]
}
],
"web_geojson_resources": [
"https://webais.demo.52north.org/pygeoapi"
],
"local_files": [
"local_geojson_files": [
"./data/"
]
}
2 changes: 1 addition & 1 deletion search-app/server/connectors/geojson_osm.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self, file_dir: str = None, tag_name: str = "building"):
self.features = self._filter_meaningful_features(gj, tag_name)
else:
if not file_dir:
file_dir = config.local_files
file_dir = config.local_geojson_files
logging.info(f"Looking for files in following dir: {file_dir[0]}")
gj_files = []
for file in glob.glob(f"{file_dir[0]}*.geojson"):
Expand Down
23 changes: 17 additions & 6 deletions search-app/server/connectors/pygeoapi_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List, Dict
from langchain.schema import Document
from config.config import Config
import re

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
Expand All @@ -15,7 +16,7 @@ def __init__(self, urls: List[str] = None):
if urls:
self.urls = urls
else:
self.urls = config.pygeoapi_instances
self.instances = config.pygeoapi_instances

async def _get_queryables(self, session: aiohttp.ClientSession, collection_id: str, base_url: str) -> dict:
"""
Expand All @@ -32,16 +33,26 @@ async def _get_queryables(self, session: aiohttp.ClientSession, collection_id: s
return queryables['properties']
return {}

async def _get_collections(self, base_url: str) -> List[dict]:
async def _get_collections(self, instance) -> List[dict]:
"""
Get all collections of a pygeoapi instance asynchronously.
"""
base_url = instance["url"]
logging.info(f"Fetching collections of pygeoapi instance: {base_url}")

exclude_urls = instance["exclude_collections"]
pattern = r'collections/([^?]+)'
exclude_collections = list(map(lambda url: re.search(pattern, url).group(1), exclude_urls))

logging.info(f"Excluding following collections from indexing operation: {exclude_collections}")


async with aiohttp.ClientSession() as session:
async with session.get(f'{base_url}/collections/') as response:
if response.status == 200:
collections = await response.json()
# excluding collections listed under "exclude_collections"
collections['collections'] = [coll for coll in collections['collections'] if coll['id'] not in exclude_collections]
logging.debug(collections)

tasks = [
Expand Down Expand Up @@ -76,12 +87,12 @@ def _generate_docs(self, base_url:str, collections: List[dict]) -> List[Document
"extent": str(doc["extent"])}) for doc in collections]
return docs

async def get_collections_and_generate_docs(self, url) -> Document:
collections = await self._get_collections(url)
docs = self._generate_docs(url, collections)
async def get_collections_and_generate_docs(self, instance) -> List[Document]:
    """
    Fetch the collections of one pygeoapi instance and turn them into documents.

    Args:
        instance: A config entry with the keys "url" and
            "exclude_collections". A bare base-URL string is also accepted
            and treated as an instance without any excluded collections.

    Returns:
        The documents that `_generate_docs` builds from the fetched
        collections.
    """
    if isinstance(instance, str):
        # Legacy call style: only the base URL was handed in.
        instance = {"url": instance, "exclude_collections": []}

    collections = await self._get_collections(instance)
    return self._generate_docs(instance["url"], collections)

async def get_docs_for_all_instances(self) -> List[Document]:
    """
    Build documents for every configured pygeoapi instance.

    All instances are processed concurrently and their documents are merged
    into a single flat list, in the order of the configured instances.

    Returns:
        The documents of all instances; an empty list when no instance is
        configured.
    """
    instances = getattr(self, "instances", None)
    if instances is None:
        # When URLs are passed to the constructor only `self.urls` is set.
        # Wrap each URL as an instance entry without excluded collections.
        instances = [
            {"url": url, "exclude_collections": []}
            for url in getattr(self, "urls", None) or []
        ]

    tasks = [
        self.get_collections_and_generate_docs(instance)
        for instance in instances
    ]
    docs_per_instance = await asyncio.gather(*tasks)

    # gather() yields one list per instance. Returning only the first one
    # would silently drop every other instance, so flatten them instead.
    return [doc for docs in docs_per_instance for doc in docs]

0 comments on commit c992e3a

Please sign in to comment.