Skip to content

Commit

Permalink
Refactored method names. Merged local and online geojson handling
Browse files Browse the repository at this point in the history
  • Loading branch information
simeonwetzel committed Aug 8, 2024
1 parent 684126d commit d98a57d
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 38 deletions.
40 changes: 26 additions & 14 deletions search-app/server/app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,22 @@ async def get_current_session(session_id: UUID = Depends(cookie), session_data:
}

# Add indexer for local geojson with OSM features
local_file_indexer = Indexer(index_name="geojson",
score_treshold=0.4,
geojson_osm_indexer = Indexer(index_name="geojson",
score_treshold=-400.0,
k = 20,
#use_hf_model=True,
#embedding_model="ellenhp/osm2vec-bert-v1"
use_hf_model=True,
embedding_model="Alibaba-NLP/gte-large-en-v1.5"
)


# Add connection to local file including building features
# Replace the value for tag_name argument if you have other data
local_file_connector = GeoJSON(tag_name="building")
geojson_osm_connector = GeoJSON(tag_name="building")

"""
local_file_connector = GeoJSON(file_dir="https://webais.demo.52north.org/pygeoapi/collections/dresden_buildings/items",
tag_name="building")
"""

app = FastAPI()

Expand Down Expand Up @@ -179,11 +185,12 @@ async def fetch_documents(indexing: bool=True, api_key: APIKey = Depends(get_api
}
}

@app.get("/index_local_files")
async def index_local_files(api_key: APIKey = Depends(get_api_key)):
@app.get("/index_geojson_osm_features")
async def index_geojson_osm(api_key: APIKey = Depends(get_api_key)):
# await local_file_connector.add_descriptions_to_features()
feature_docs = await local_file_connector._features_to_docs()
res_local = local_file_indexer._index(documents=feature_docs)
feature_docs = await geojson_osm_connector._features_to_docs()
logging.info(f"Converted {len(feature_docs)} Features or FeatureGroups to documents")
res_local = geojson_osm_indexer._index(documents=feature_docs)
return res_local

def generate_combined_feature_collection(doc_list: List[Document]):
Expand All @@ -205,21 +212,26 @@ def generate_combined_feature_collection(doc_list: List[Document]):

@app.get("/retrieve_geojson")
async def retrieve_geojson(query: str):
features = local_file_indexer.retriever.invoke(query)
features = geojson_osm_indexer.retriever.invoke(query)

return generate_combined_feature_collection(features)


@app.get("/clear_index")
async def clear_index(index_name: str, api_key: APIKey = Depends(get_api_key)):
if index_name not in (indexes, 'geojson'):
if index_name not in indexes and index_name != 'geojson':
raise HTTPException(status_code=400, detail="Invalid index name")
elif index_name == 'geojson':
local_file_indexer._clear()
else:

if index_name == 'geojson':
logging.info("Clearing geojson index")
geojson_osm_indexer._clear()
else:
logging.info(f"Clearing index: {index_name}")
indexes[index_name]._clear()

return {'message': 'Index cleared'}


@app.get("/retrieve_with_id")
async def retrieve_with_id(index_name: str, _id: str):
if index_name not in indexes:
Expand Down
71 changes: 50 additions & 21 deletions search-app/server/connectors/geojson_osm.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,22 @@
import glob
import asyncio
import aiohttp
import requests
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm
import json
import re

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)

config = Config('./config/config.json')

def is_url(string):
    """Return True if *string* looks like an http(s)/ftp URL, else False."""
    url_pattern = r'^(https?|ftp):\/\/[^\s/$.?#].[^\s]*$'
    return bool(re.match(url_pattern, string))

class GeoJSON():
"""
The GeoJSON class processes GeoJSON files containing OpenStreetMap (OSM) features,
Expand Down Expand Up @@ -52,26 +59,47 @@ class GeoJSON():
_features_to_docs() -> List[Document]:
Converts features into a list of Document objects for further use.
"""
def __init__(self, file_dir: str = None, tag_name: str = "building"):
if file_dir is None:
file_dir = config.local_files
logging.info(f"Looking for files in following dir: {file_dir[0]}")
gj_files = []
for file in glob.glob(f"{file_dir[0]}*.geojson"):
logging.info(f"Extracting features from file: {file}")
with open(file) as f:
gj = geojson.load(f)
gj_files.extend(gj['features'])
# Todo: Maybe add filename to the properties of each feature

self.features = self._filter_meaningful_features(gj_files, tag_name)
"""
feature_collection = geojson.FeatureCollection(self.features)
with open(f'{file_dir[0]}local_file.geojson', 'w') as f:
geojson.dump(feature_collection, f)
"""

def __init__(self, file_dir: str = None, tag_name: str = "building"):
    """Load GeoJSON features from an online PyGeoAPI collection or local files.

    :param file_dir: Either the URL of a PyGeoAPI collection items endpoint,
                     or None to fall back to the directory configured in
                     ``config.local_files`` — TODO confirm config shape (indexed
                     with ``[0]`` below, so presumably a list of directories).
    :param tag_name: OSM tag used to keep only "meaningful" features
                     (features whose tag value is just "yes" are dropped).
    """
    if file_dir and is_url(file_dir):
        # We assume the online resource to be a collection published via a PyGeoAPI instance
        logging.info("Getting features from online resource")
        params = {"f": "json", "limit": 10000}
        gj = self._fetch_features_from_online_resource(file_dir, params)
        # Use logging (not print) for consistency with the rest of this module
        logging.info(f"Retrieved {len(gj)} features")

        self.features = self._filter_meaningful_features(gj, tag_name)
    else:
        if not file_dir:
            file_dir = config.local_files
        logging.info(f"Looking for files in following dir: {file_dir[0]}")
        gj_files = []
        for file in glob.glob(f"{file_dir[0]}*.geojson"):
            logging.info(f"Extracting features from file: {file}")
            with open(file) as f:
                gj = geojson.load(f)
            gj_files.extend(gj['features'])
            # Todo: Maybe add filename to the properties of each feature

        self.features = self._filter_meaningful_features(gj_files, tag_name)

    logging.info(f"Received {len(self.features)} features")
    self.tag_name = tag_name

def _fetch_features_from_online_resource(self, url, params):
    """Page through a PyGeoAPI items endpoint and collect all features.

    Repeatedly requests ``url`` with an increasing ``offset`` until a page
    comes back with no features, then returns everything gathered.

    :param url: Items endpoint of the collection.
    :param params: Base query parameters; must contain a ``'limit'`` key
                   used as the page size / offset increment.
    :return: List of all feature dicts across every page.
    :raises requests.HTTPError: If the server returns an error status.
    """
    # Work on a copy so the caller's dict is not mutated as a side effect.
    params = dict(params)
    offset = 0
    all_features = []

    while True:
        params['offset'] = offset
        response = requests.get(url, params=params)
        # Fail loudly on HTTP errors instead of silently parsing an error
        # body (which previously could return partial or empty results).
        response.raise_for_status()
        response_json = response.json()
        features = response_json.get('features', [])
        if not features:
            break
        all_features.extend(features)
        offset += params['limit']

    return all_features

def _filter_meaningful_features(self, features: List[dict], tag_name: str) -> List[dict]:
filtered_features = list(filter(lambda feature: feature.get("properties", {}).get(tag_name) != "yes", features))
Expand Down Expand Up @@ -107,6 +135,7 @@ async def _get_descriptions_for_tags(self) -> Dict[str, str]:
return dict(zip(tasks.keys(), descriptions))

async def add_descriptions_to_features(self) -> None:
logging.info(f"Fetching descriptions for {len(self.features)} OSM features")
tag_description_map = await self._get_descriptions_for_tags()

for feature in self.features:
Expand Down Expand Up @@ -160,7 +189,7 @@ async def _features_to_docs(self) -> List[Document]:
properties = feature['properties']
name = properties.get("name", "Unknown")
description = self._get_feature_description(feature)
page_content = f"Feature Name: {name}\n\n{description}"
page_content = f"Name: {name}\n\n{description}"
metadata = {
"type": properties.get(self.tag_name, "Unknown"),
"feature": json.dumps(feature["geometry"] , indent=2),
Expand All @@ -177,7 +206,7 @@ async def _features_to_docs(self) -> List[Document]:
tag_docs = []
for tag, features in grouped_features.items():
tag_description = features[0]['properties'].get('description', tag.replace('=', ': '))
page_content = f"Feature Type: {tag_description}\n\nThis collection includes {len(features)} features of type {tag}."
page_content = f"{tag}: {tag_description}\n\nThis collection includes {len(features)} features of type {tag}."
metadata = {
"tag": tag,
"count": len(features),
Expand Down
8 changes: 6 additions & 2 deletions search-app/server/graph/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@

def generate_conversation_prompt():
system_prompt = """
**AI Instructions:**
    **AI Instructions:**
You are an AI designed to assist users in finding environmental or geospatial datasets. Follow these guidelines:
1. **Extract Search Criteria:** Identify the specific type of environmental or geospatial data the user is requesting.
    2. **Refine the Search:** If the request is vague, ask follow-up questions about the time period, geographic area, resolution, or format to gather more details. Only re-ask a maximum of 3 times per inquiry and try to ask as little as possible. Use bold formatting (markdown) to highlight important aspects in your response.
Expand All @@ -13,6 +14,10 @@ def generate_conversation_prompt():
'You must always output a JSON object with an "answer" key and a "search_criteria" key.'
If you have the impression that the user gives the go to search, do not ask follow-up questions and add a flag "ready_to_retrieve": "yes".
**Tips for Natural Interaction:**
- Maintain a friendly and conversational tone.
- Acknowledge user inputs and express appreciation for their responses.
- Keep responses clear and straightforward while ensuring they meet the user's needs.
**Example Conversations:**
Expand Down Expand Up @@ -66,7 +71,6 @@ def generate_conversation_prompt():
],
)


return prompt


Expand Down
3 changes: 2 additions & 1 deletion search-app/server/indexing/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def __init__(

if use_hf_model:
model_name = self.embedding_model
model_kwargs = {'device': 'cpu'}
model_kwargs = {'device': 'cpu',
'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
model_name=model_name,
Expand Down

0 comments on commit d98a57d

Please sign in to comment.