diff --git a/search-app/server/app/server.py b/search-app/server/app/server.py
index 98dab6c..1394600 100644
--- a/search-app/server/app/server.py
+++ b/search-app/server/app/server.py
@@ -69,16 +69,22 @@ async def get_current_session(session_id: UUID = Depends(cookie), session_data:
     }
 
 # Add indexer for local geojson with OSM features
-local_file_indexer = Indexer(index_name="geojson",
-                             score_treshold=0.4,
+geojson_osm_indexer = Indexer(index_name="geojson",
+                             score_treshold=-400.0,
                              k = 20,
-                             #use_hf_model=True,
-                             #embedding_model="ellenhp/osm2vec-bert-v1"
+                             use_hf_model=True,
+                             embedding_model="Alibaba-NLP/gte-large-en-v1.5"
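+                             # NOTE: retrieval score scales differ per embedding model and
+                             # vector store; this strongly negative threshold appears tuned
+                             # to raw (unnormalized) gte-large-en-v1.5 scores, since the
+                             # indexer sets normalize_embeddings=False.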
""" The GeoJSON class processes GeoJSON files containing OpenStreetMap (OSM) features, @@ -52,26 +59,47 @@ class GeoJSON(): _features_to_docs() -> List[Document]: Converts features into a list of Document objects for further use. """ - def __init__(self, file_dir: str = None, tag_name: str = "building"): - if file_dir is None: - file_dir = config.local_files - logging.info(f"Looking for files in following dir: {file_dir[0]}") - gj_files = [] - for file in glob.glob(f"{file_dir[0]}*.geojson"): - logging.info(f"Extracting features from file: {file}") - with open(file) as f: - gj = geojson.load(f) - gj_files.extend(gj['features']) - # Todo: Maybe add filename to the properties of each feature - - self.features = self._filter_meaningful_features(gj_files, tag_name) - """ - feature_collection = geojson.FeatureCollection(self.features) - with open(f'{file_dir[0]}local_file.geojson', 'w') as f: - geojson.dump(feature_collection, f) - """ - + def __init__(self, file_dir: str = None, tag_name: str = "building"): + if file_dir and is_url(file_dir): + """We assume the online resource to be a collection published via a PyGeoAPI instance""" + logging.info("Getting features from online resource") + params = {"f": "json", "limit": 10000} + gj = self._fetch_features_from_online_resource(file_dir, params) + print(f"Retrieved {len(gj)} features") + + self.features = self._filter_meaningful_features(gj, tag_name) + else: + if not file_dir: + file_dir = config.local_files + logging.info(f"Looking for files in following dir: {file_dir[0]}") + gj_files = [] + for file in glob.glob(f"{file_dir[0]}*.geojson"): + logging.info(f"Extracting features from file: {file}") + with open(file) as f: + gj = geojson.load(f) + gj_files.extend(gj['features']) + # Todo: Maybe add filename to the properties of each feature + + self.features = self._filter_meaningful_features(gj_files, tag_name) + + logging.info(f"Recieved {len(self.features)} features") self.tag_name = tag_name + + def _fetch_features_from_online_resource(self, url, params): + offset = 0 + all_features = [] + + while True: + params['offset'] = offset + response = requests.get(url, params=params) + response_json = response.json() + features = response_json.get('features', []) + if not features: + break + all_features.extend(features) + offset += params['limit'] + + return all_features def _filter_meaningful_features(self, features: List[dict], tag_name: str) -> List[dict]: filtered_features = list(filter(lambda feature: feature.get("properties", {}).get(tag_name) != "yes", features)) @@ -107,6 +135,7 @@ async def _get_descriptions_for_tags(self) -> Dict[str, str]: return dict(zip(tasks.keys(), descriptions)) async def add_descriptions_to_features(self) -> None: + logging.info(f"Fetching descriptions for {len(self.features)} OSM features") tag_description_map = await self._get_descriptions_for_tags() for feature in self.features: @@ -160,7 +189,7 @@ async def _features_to_docs(self) -> List[Document]: properties = feature['properties'] name = properties.get("name", "Unknown") description = self._get_feature_description(feature) - page_content = f"Feature Name: {name}\n\n{description}" + page_content = f"Name: {name}\n\n{description}" metadata = { "type": properties.get(self.tag_name, "Unknown"), "feature": json.dumps(feature["geometry"] , indent=2), @@ -177,7 +206,7 @@ async def _features_to_docs(self) -> List[Document]: tag_docs = [] for tag, features in grouped_features.items(): tag_description = features[0]['properties'].get('description', tag.replace('=', ': 
+        offset = 0
+        all_features = []
+
+        while True:
+            params['offset'] = offset
+            response = requests.get(url, params=params)
+            response_json = response.json()
+            features = response_json.get('features', [])
+            if not features:
+                break
+            all_features.extend(features)
+            offset += params['limit']
+
+        return all_features
 
     def _filter_meaningful_features(self, features: List[dict], tag_name: str) -> List[dict]:
         filtered_features = list(filter(lambda feature: feature.get("properties", {}).get(tag_name) != "yes", features))
@@ -107,6 +135,7 @@ async def _get_descriptions_for_tags(self) -> Dict[str, str]:
         return dict(zip(tasks.keys(), descriptions))
 
     async def add_descriptions_to_features(self) -> None:
+        logging.info(f"Fetching descriptions for {len(self.features)} OSM features")
         tag_description_map = await self._get_descriptions_for_tags()
 
         for feature in self.features:
@@ -160,7 +189,7 @@ async def _features_to_docs(self) -> List[Document]:
             properties = feature['properties']
             name = properties.get("name", "Unknown")
             description = self._get_feature_description(feature)
-            page_content = f"Feature Name: {name}\n\n{description}"
+            page_content = f"Name: {name}\n\n{description}"
             metadata = {
                 "type": properties.get(self.tag_name, "Unknown"),
                 "feature": json.dumps(feature["geometry"] , indent=2),
@@ -177,7 +206,7 @@ async def _features_to_docs(self) -> List[Document]:
         tag_docs = []
         for tag, features in grouped_features.items():
             tag_description = features[0]['properties'].get('description', tag.replace('=', ': '))
-            page_content = f"Feature Type: {tag_description}\n\nThis collection includes {len(features)} features of type {tag}."
+            page_content = f"{tag}: {tag_description}\n\nThis collection includes {len(features)} features of type {tag}."
             metadata = {
                 "tag": tag,
                 "count": len(features),
diff --git a/search-app/server/graph/prompts.py b/search-app/server/graph/prompts.py
index 2cf1740..d5870af 100644
--- a/search-app/server/graph/prompts.py
+++ b/search-app/server/graph/prompts.py
@@ -3,7 +3,8 @@ def generate_conversation_prompt():
     system_prompt = """
-    **AI Instructions:**
+    **AI Instructions:**
+
     You are an AI designed to assist users in finding environmental or geospatial datasets. Follow these guidelines:
     1. **Extract Search Criteria:** Identify the specific type of environmental or geospatial data the user is requesting.
     2. **Refine the Search:** If the request is vague, ask follow-up questions about the time period, geographic area, resolution, or format to gather more details. Only re-ask a maximum of 3 times per inquiry and try to ask as few questions as possible. Use bold formatting (markdown) to highlight important aspects in your response.
@@ -13,6 +14,10 @@ def generate_conversation_prompt():
     'You must always output a JSON object with an "answer" key and a "search_criteria" key.'
     If you have the impression that the user gives the go-ahead to search, do not ask follow-up questions and add a flag "ready_to_retrieve": "yes".
 
+    **Tips for Natural Interaction:**
+    - Maintain a friendly and conversational tone.
+    - Acknowledge user inputs and express appreciation for their responses.
+    - Keep responses clear and straightforward while ensuring they meet the user's needs.
 
     **Example Conversations:**
@@ -66,7 +71,6 @@ def generate_conversation_prompt():
         ],
     )
-
     return prompt
diff --git a/search-app/server/indexing/indexer.py b/search-app/server/indexing/indexer.py
index 8b5bc6c..d814c6d 100644
--- a/search-app/server/indexing/indexer.py
+++ b/search-app/server/indexing/indexer.py
@@ -25,7 +25,8 @@ def __init__(
         if use_hf_model:
             model_name = self.embedding_model
-            model_kwargs = {'device': 'cpu'}
+            model_kwargs = {'device': 'cpu',
+                            # gte-large-en-v1.5 ships custom model code on the Hub
+                            'trust_remote_code': True}
             encode_kwargs = {'normalize_embeddings': False}
             hf = HuggingFaceEmbeddings(
                 model_name=model_name,