remove errors and query optimization

dieterich-lab · Oct 7, 2021 · 9e81b6f
1 parent 9137a65
commit 9e81b6f
Show file tree

Hide file tree

Showing 11 changed files with 146 additions and 11,392 deletions.
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ Not recommended for pure deployment.
 * Develop
 
 
-## Data Import ### [(detailed documentation)](https://github.com/dieterich-lab/medex/tree/time-series/dataset_examples/Data_import.md)
+### Data Import   [(detailed documentation)](https://github.com/dieterich-lab/medex/tree/time-series/dataset_examples/Data_import.md)
 * Database imports run every night at 5:05 and at startup.
 * The database is only updated if there is new data to import.
 * In order to add new data add a new `entities.csv` and `dataset.csv` to the `./import` folder.

diff --git a/dataset_examples/dataset_test.csv b/dataset_examples/dataset_test.csv
diff --git a/modules/import_dataset_postgre.py b/modules/import_dataset_postgre.py
@@ -98,7 +98,7 @@ def load_data(entities, dataset, header, rdb):
             if 'Visit' in header:
                 line = [i] + row[0:6] + [";".join([str(x) for x in row[6:]])]
             else:
-                line = [i] + row[0:1] + [1] + row[1:5] + [";".join([str(x) for x in row[5:]])]
+                line = [i] + row[0:2] + [1] + row[2:5] + [";".join([str(x) for x in row[5:]])]
             if len(line) < 6:
                 print("This line doesn't have appropriate format:", line)
             else:
@@ -144,6 +144,8 @@ def alter_table(rdb):
 
     sql12 = """CREATE INDEX IF NOT EXISTS "Key_index_numerical" ON examination_numerical ("Key")"""
     sql13 = """CREATE INDEX IF NOT EXISTS "Key_index_categorical" ON examination_categorical ("Key")"""
+    sql15 = """CREATE INDEX IF NOT EXISTS "ID_index_numerical" ON examination_numerical ("Name_ID")"""
+    sql16 = """CREATE INDEX IF NOT EXISTS "ID_index_categorical" ON examination_categorical ("Name_ID")"""
     sql14 = """CREATE EXTENSION IF NOT EXISTS tablefunc"""
 
     try:
@@ -168,6 +170,8 @@ def alter_table(rdb):
         cur.execute(sql12)
         cur.execute(sql13)
         cur.execute(sql14)
+        cur.execute(sql15)
+        cur.execute(sql16)
         rdb.commit()
     except Exception:
         return print("Problem with connection with database")

diff --git a/modules/import_scheduler.py b/modules/import_scheduler.py
@@ -73,11 +73,12 @@ def start_import(rdb):
         return print("Data set not changed", file=sys.stderr)
     else:
         if not os.path.isfile(header):
-            header = ['Name_ID', 'measurement']
+            header = ['Name_ID','Case_ID', 'measurement']
         else:
             with open(header, 'r') as in_file:
                 for row in in_file:
                     header = row.replace("\n", "").split(",")
+                header = header[0:3]
         # use function from import_dataset_postgre.py to create tables in database
         print("Start create tables")
         idp.create_table(rdb)

diff --git a/modules/load_data_postgre.py b/modules/load_data_postgre.py
diff --git a/url_handlers/boxplot.py b/url_handlers/boxplot.py
@@ -46,21 +46,23 @@ def post_boxplots():
 
     # handling errors and load data from database
     error = None
-    if not measurement:
+    if measurement == "Search entity":
         error = "Please select number of {}".format(measurement_name)
     elif numeric_entities == "Search entity" or categorical_entities == "Search entity":
         error = "Please select entity"
     elif not subcategory_entities:
         error = "Please select subcategory"
-    if not error:
-        df, error = ps.get_num_cat_values(numeric_entities, categorical_entities, subcategory_entities, measurement,case_ids,
+    elif not error:
+        print(subcategory_entities)
+        df, error = ps.get_num_cat_values(numeric_entities, categorical_entities, subcategory_entities, measurement, case_ids,
                                           categorical_filter, categorical_names, numerical_filter_name, from1, to1,
                                           measurement_filter, date, rdb)
-        df = filtering.checking_for_block(block, df, Name_ID, measurement_name)
+        df = df.rename(columns={"Name_ID": "{}".format(Name_ID), "measurement": "{}".format(measurement_name)})
         numeric_entities_unit, error = ps.get_unit(numeric_entities, rdb)
         if numeric_entities_unit:
             numeric_entities_unit = numeric_entities + ' (' + numeric_entities_unit + ')'
-            df.columns =  [Name_ID,measurement_name, numeric_entities_unit,categorical_entities]
+            print(df)
+            df.columns = [Name_ID,measurement_name, numeric_entities_unit,categorical_entities]
         else:
             numeric_entities_unit = numeric_entities
         if not error:

diff --git a/url_handlers/data.py b/url_handlers/data.py
@@ -21,6 +21,7 @@ def get_data():
     numerical_filter = filtering.check_for_numerical_filter_get()
     categorical_filter, categorical_names = filtering.check_for_filter_get()
     return render_template('data.html',
+                           block=block,
                            all_entities=all_entities,
                            name=measurement_name,
                            start_date=session.get('start_date'),
@@ -62,6 +63,7 @@ def post_data():
     if error:
         return render_template('data.html',
                                error=error,
+                               block=block,
                                all_entities=all_entities,
                                all_measurement=all_measurement,
                                name=measurement_name,
@@ -102,6 +104,7 @@ def post_data():
 
     return render_template('data.html',
                            error=error,
+                           block=block,
                            all_entities=all_entities,
                            all_measurement=all_measurement,
                            measurement=measurement,

diff --git a/url_handlers/filtering.py b/url_handlers/filtering.py
@@ -29,6 +29,7 @@ def check_for_filter_post():
     session['categorical_filter'] = categorical_filter
     session['categorical_names'] = categorical_names
     categorical_filter_zip = None
+    print(categorical_filter)
     if categorical_filter is not None:
         categorical_filter_zip = zip(categorical_names, categorical_filter)
 

diff --git a/url_handlers/histogram.py b/url_handlers/histogram.py
@@ -62,6 +62,7 @@ def post_statistics():
         numeric_entities_unit, error = ps.get_unit(numeric_entities, rdb)
         if numeric_entities_unit:
             numeric_entities_unit = numeric_entities + ' (' + numeric_entities_unit + ')'
+            print(df)
             df.columns = [Name_ID,measurement_name, numeric_entities_unit,categorical_entities]
         else:
             numeric_entities_unit = numeric_entities

diff --git a/url_handlers/scatter_plot.py b/url_handlers/scatter_plot.py
@@ -67,6 +67,7 @@ def post_plots():
                                                        categorical_filter, categorical_names, numerical_filter_name,
                                                        from1, to1, measurement_filter, date, rdb)
 
+
         x_unit, error = ps.get_unit(x_axis, rdb)
         y_unit, error = ps.get_unit(y_axis, rdb)
         if x_unit and y_unit:
@@ -92,7 +93,7 @@ def post_plots():
                                           from1, to1, measurement_filter, date, rdb)
             if not error:
 
-                categorical_df = numeric_df.merge(df, on="Patient_ID").dropna()
+                categorical_df = numeric_df.merge(df, on="Name_ID").dropna()
                 categorical_df = categorical_df.sort_values(by=[categorical_entities])
                 categorical_df = categorical_df.rename(
                     columns={"Name_ID": "{}".format(Name_ID), "measurement": "{}".format(measurement_name)})

diff --git a/webserver.py b/webserver.py
@@ -63,6 +63,27 @@ def check_for_env(key: str, default=None, cast=None):
     block = 'block'
 
 
+# data store for filters and download
+class DataStore():
+
+    case_ids = []
+    table_case_ids = None
+
+    # for table browser server side
+    table_schema = None
+    table_browser_column = None
+    dict = None
+    table_browser_entities = None
+    table_browser_what_table = None
+    table_browser_column2 = None
+
+
+
+
+table_builder = TableBuilder()
+data = DataStore()
+
+
 # favicon
 @app.route('/favicon.ico')
 def favicon():
@@ -73,8 +94,8 @@ def favicon():
 # information about database
 @app.context_processor
 def message_count():
-    case_id = session.get('case_ids')
-    if case_id != None:
+    case_id = data.case_ids
+    if case_id :
         case_display = 'block'
     else:
         case_display = 'none'
@@ -106,21 +127,6 @@ def message_count():
                 case_display=case_display)
 
 
-# data store for filters and download
-class DataStore():
-
-    # for table browser server side
-    table_schema = None
-    table_browser_column = None
-    dict = None
-    table_browser_entities = None
-    table_browser_what_table = None
-    table_browser_column2 = None
-
-
-table_builder = TableBuilder()
-data = DataStore()
-
 # Urls in the 'url_handlers' directory (one file for each new url)
 # import a Blueprint
 from url_handlers.data import data_page
@@ -167,8 +173,8 @@ def get_cases():
     session_id_json = {"session_id": "{}".format(session_id)}
     cases_get = requests.post(EXPRESS_MEDEX_MEDDUSA_URL, json=session_id_json)
     case_ids = cases_get.json()
-    session['case_ids'] = case_ids['cases_ids']
-    session['table_case_ids'] = pd.DataFrame(case_ids['cases_ids'], columns=["Case_ID"]).to_csv(index=False)
+    data.case_ids = case_ids['cases_ids']
+    data.table_case_ids = pd.DataFrame(case_ids['cases_ids'], columns=["Case_ID"]).to_csv(index=False)
 
     return redirect('/data')
 
@@ -179,7 +185,7 @@ def download(filename):
     if filename == 'data.csv':
         csv = data.csv
     elif filename == 'case_ids.csv':
-        csv = session.get('table_case_ids')
+        csv = data.table_case_ids
     # Create a string buffer
     buf_str = io.StringIO(csv)