From 545ae4b629a7345e4dcb45bb90c2a57202711685 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Mon, 28 Mar 2022 17:53:58 +0200
Subject: [PATCH 001/104] transformation of NPI data

---
 .../memilio/epidata/transformNPIData.py | 958 ++++++++++++++++++
 1 file changed, 958 insertions(+)
 create mode 100644 pycode/memilio-epidata/memilio/epidata/transformNPIData.py

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
new file mode 100644
index 0000000000..490b08a195
--- /dev/null
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -0,0 +1,958 @@
+#############################################################################
+# Copyright (C) 2020-2021 German Aerospace Center (DLR-SC)
+#
+# Authors: Martin J. Kuehn
+#
+# Contact: Martin J. Kuehn
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#############################################################################
+from datetime import datetime, timedelta
+import time
+import os
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import colors
+from scipy.spatial.distance import pdist
+from scipy.cluster import hierarchy
+from sklearn.cluster import KMeans
+
+from memilio.epidata import getDataIntoPandasDataFrame as gd
+from memilio.epidata import geoModificationGermany as geoger
+from memilio.epidata import defaultDict as dd
+from memilio.epidata import customPlot
+
+
+def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all):
+    """! Computes a score for a particular clustering based on the
+    correlation matrix. For each correlation range considered (e.g., values
+    between 0.5 and 0.75 or between 0.75 and 1), the score is the share of
+    these values of the correlation matrix that lie in the diagonal blocks
+    of the clustered correlation matrix as opposed to the off-diagonal
+    blocks.
+
+    @param corr_mat correlation matrix between the features / data set items
+    that were clustered.
+    @param idx_to_cluster_idx Mapping of data item to cluster index.
+    @param indices_all List of indices of all data items.
+
+    @return Scores for the provided clustering.
+    """
+
+    if idx_to_cluster_idx.min() == 1:
+        idx_to_cluster_idx -= 1
+
+    # store indices of clusters
+    clusters = [[] for i in range(idx_to_cluster_idx.max()+1)]
+    for ii in range(len(idx_to_cluster_idx)):
+        clusters[idx_to_cluster_idx[ii]].append(ii)
+    # store remaining/perpendicular indices for all clusters
+    clusters_perp = [[] for i in range(idx_to_cluster_idx.max()+1)]
+    for ii in range(len(clusters)):
+        clusters_perp[ii] = list(indices_all.difference(set(clusters[ii])))
+    # extract correlation values of block diagonals and offdiagonals separately
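+    # Illustration of the block extraction below (toy values, not part of
+    # the pipeline): for clusters = [[0, 1], [2]] and indices_all = {0, 1, 2},
+    # np.ix_ builds an open mesh, so corr_mat[np.ix_([0, 1], [0, 1])] is the
+    # full 2x2 diagonal block of the first cluster, while
+    # corr_mat[np.ix_([0, 1], [2])] is its 2x1 off-diagonal block.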
+    corr_diag = []
+    corr_offdiag = []
+    for ii in range(len(clusters)):
+        corr_diag = np.append(corr_diag, abs(
+            corr_mat[np.ix_(clusters[ii], clusters[ii])].flatten()))
+        corr_offdiag = np.append(corr_offdiag, abs(
+            corr_mat[np.ix_(clusters[ii], clusters_perp[ii])].flatten()))
+
+    corr_thresholds = [0.25, 0.5, 0.75]
+    cluster_quantification = np.zeros(6)
+    for ii in range(len(corr_thresholds)):
+        num_diag = len(np.where(corr_diag > corr_thresholds[ii])[0])
+        num_offdiag = len(np.where(corr_offdiag > corr_thresholds[ii])[0])
+        if ii < len(corr_thresholds)-1:
+            num_diag -= len(np.where(corr_diag > corr_thresholds[ii+1])[0])
+            num_offdiag -= len(np.where(corr_offdiag >
+                                        corr_thresholds[ii+1])[0])
+        cluster_quantification[2*ii] = num_diag / (num_diag+num_offdiag)
+        cluster_quantification[2*ii+1] = (
+            num_diag+num_offdiag) / (len(indices_all)**2)
+
+    # print scores on clustering
+    print("Number of clusters: " + str(len(clusters)) +
+          ", shares diag/all between [0.25, 0.5, 0.75]: %.4f" %
+          cluster_quantification[0] + " (%.4f" % cluster_quantification[1] +
+          "), " + " %.4f " % cluster_quantification[2] + " (%.4f" %
+          cluster_quantification[3] + "), " + " %.4f " %
+          cluster_quantification[4] + " (%.4f" % cluster_quantification[5] + ")")
+
+    return cluster_quantification
+
+
+def compute_hierarch_clustering(corr_mat, corr_pairwdist,
+                                metrics=['single', 'complete', 'average',
+                                         'weighted', 'centroid', 'median',
+                                         'ward']):
+    """! Computes a hierarchical clustering for a (list of) metric(s) and
+    provides the maximum cophenetic distance(s) as well as a score for the
+    clustering (see @method evaluate_clustering(...)).
+
+    @param corr_mat correlation matrix between the features / data set items
+    to be clustered hierarchically.
+    @param corr_pairwdist Computed pairwise distance between the features /
+    data set items.
+    @param metrics Metric or list of metrics to compute the hierarchical
+    clustering.
+
+    @return (List of) hierarchical clustering(s), maximum cophenetic distance(s)
+    and scores of the hierarchical clustering.
+    """
+    # NOTE: if changing the metric, pay attention to the linkage methods;
+    # 'centroid', 'median', and 'ward' are correctly defined only if the
+    # Euclidean pairwise metric is used.
+    # Based on the distances, we compute a hierarchical clustering for
+    # different metrics
+    max_coph_corr = 0
+    scores = dict()
+    # allow single entry
+    if not isinstance(metrics, list):
+        metrics = [metrics]
+    # iterate over list
+    for metric in metrics:
+        cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=metric)
+        # compute cophenetic correlation coefficient and distances
+        coph_corr, coph_dists = hierarchy.cophenet(
+            cluster_hierarch, pdist(corr_mat))
+        scores[metric] = coph_corr
+        if coph_corr > max_coph_corr:
+            max_coph_corr = coph_corr
+            max_metric = metric
+            max_coph_dist = coph_dists
+
+    cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_metric)
+
+    print(
+        "Cophenetic correlation coefficient for metric " + max_metric + ": " +
+        str(max_coph_corr))
+
+    return cluster_hierarch, max_coph_dist, scores
+
+
+def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights):
+    """! Flattens a hierarchical clustering for a (list of) maximum cophenetic
+    distance(s) allowed within the flat clusters and evaluates each resulting
+    clustering with respect to the corresponding correlation matrix.
+
+    @param corr_mat correlation matrix between the features / data set items
+    clustered hierarchically.
+    @param cluster_hierarch hierarchical clustering of given features / data
+    set items.
+    @param weights Maximum cophenetic distance or list of maximum cophenetic
+    distances to compute the flat clustering(s).
+
+    @return flat clustering(s) according to the (list of) maximum distance(s).
+    """
+
+    # all indices in npis_corr from 0 to n-1
+    npi_indices_all = set(range(corr_mat.shape[0]))
+    npi_idx_to_cluster_idx_list = []
+    # allow single entries
+    if not isinstance(weights, list):
+        weights = [weights]
+    # iterate over weights
+    for weight in weights:
+        # use the given weight to flatten the dendrogram
+        npi_idx_to_cluster_idx = hierarchy.fcluster(
+            cluster_hierarch, weight, criterion='distance')
+
+        # evaluate clustering
+        evaluate_clustering(corr_mat, npi_idx_to_cluster_idx, npi_indices_all)
+
+        # append new npi_idx to cluster_idx assignment to list of assignments
+        npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx)
+
+    return npi_idx_to_cluster_idx_list
+
+
+def print_manual_download(filename, url):
+
+    print(
+        'This script needs manual downloading of files. Please register'
+        ' at corona-datenplatform.com and download ' + filename + ' from ' + url +
+        '. Then move it to a folder named npi_raw in this directory.')
+
+
+def transform_npi_data(fine_resolution=2,
+                       read_data=dd.defaultDict['read_data'],
+                       file_format=dd.defaultDict['file_format'],
+                       out_folder=dd.defaultDict['out_folder'],
+                       start_date=dd.defaultDict['start_date'],
+                       end_date=dd.defaultDict['end_date'],
+                       make_plot=dd.defaultDict['make_plot'],
+                       ):
+    """! Loads a certain resolution of recorded NPI data from
+    the Corona Datenplattform and transforms it according to the
+    arguments given.
+
+    For full functionality, please manually download
+    - kr_massnahmen_unterkategorien.csv
+    - datensatzbeschreibung_massnahmen.xlsx
+    from https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise
+    and
+    - kr_massnahmen_oberkategorien.csv
+    from https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise
+    and move it to a folder named *npi_raw* in the current directory.
+
+    @param fine_resolution 2 [Default] or 0 or 1. Defines which categories
+    are considered.
+    If '2' is set, all the subcategories (~1200) are considered.
+    If '1' is set, all incidence levels of subcategories are merged and
+    ~200 NPIs are considered.
+    If '0' is chosen, only the main, summarizing categories (~20) are used.
+    @param read_data False [Default] or True. Defines if the formatted data
+    frame is read from file or created anew from the raw data.
+    @param file_format File format which is used for writing the data.
+    Default defined in defaultDict.
+    @param out_folder Path to folder where data is written in folder
+    out_folder/Germany.
+    @param start_date [Default = '', taken from read data] Start date
+    of stored data frames.
+    @param end_date [Default = '', taken from read data] End date of
+    stored data frames.
+    @param make_plot False [Default] or True. Defines if plots are
+    generated with matplotlib.
+ """ + + directory = out_folder + directory = os.path.join(directory, 'Germany/') + gd.check_dir(directory) + + if not read_data: + + if fine_resolution > 0: + try: + df_npis_old = pd.read_csv( + 'npi_raw/kr_massnahmen_unterkategorien.csv', sep=';') + except FileNotFoundError: + print_manual_download( + 'kr_massnahmen_unterkategorien.csv', + 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') + raise FileNotFoundError + df_npis_old.rename(dd.GerEng, axis=1, inplace=True) + + try: + df_npis_desc = pd.read_excel( + 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', + sheet_name=2) + except FileNotFoundError: + print_manual_download( + 'datensatzbeschreibung_massnahmen.xlsx', + 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') + raise FileNotFoundError + + # check on Krankenhaeuser and Pflege + test_codes = ['M23_010', 'M23_020', 'M23_030', 'M23_040', + 'M23_050', 'M23_060', 'M24_010', 'M24_020', + 'M24_030', 'M24_040', 'M24_050', 'M24_060'] + for tcode in test_codes: + for i in [''] + ["_" + str(i) for i in range(1, 6)]: + if(df_npis_old[df_npis_old.NPI_code == tcode+i].iloc[:, 6:].max().max() > 0): + print(tcode+i + " used.") + # end check + + else: + try: + df_npis_old = pd.read_csv( + 'npi_raw/kr_massnahmen_oberkategorien.csv') + except FileNotFoundError: + print_manual_download( + 'datensatzbeschreibung_massnahmen.xlsx', + 'https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise') + raise FileNotFoundError + df_npis_old.rename(dd.GerEng, axis=1, inplace=True) + + try: + df_npis_desc = pd.read_excel( + 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', + sheet_name=3) + except FileNotFoundError: + print_manual_download( + 'datensatzbeschreibung_massnahmen.xlsx', + 'https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise') + raise FileNotFoundError + + else: # read formatted file + + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'germany_counties_npi_subcat_incgrouped' + else: + filename = 'germany_counties_npi_subcat' + else: + filename = 'germany_counties_npi_maincat' + df_npis = pd.read_json(directory + filename + ".json") + + # read data frame of variable names and descriptions + try: + if fine_resolution > 0: + df_npis_desc = pd.read_excel( + 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', sheet_name=2) + else: + df_npis_desc = pd.read_excel( + 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', sheet_name=3) + except FileNotFoundError: + print_manual_download( + 'datensatzbeschreibung_massnahmen.xlsx', + 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') + raise FileNotFoundError + + # get existing codes that are used (in df_npis_old M22-M24 are empty) + npi_codes_prior = df_npis_desc.Variablenname + + # correct differences in codes between data sheet and explanation sheet + if fine_resolution > 0: + # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} + for i in range(1, 6): + npi_codes_prior[npi_codes_prior == 'M04_'+str(i)] = ['M04_120_'+str( + i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] + + # correct M05_N codes to M_05_M_N, N in {1,...,5}, M in {130,150,120,140,110,100,160} + for i in range(1, 6): + npi_codes_prior[npi_codes_prior == 'M05_'+str(i)] = ['M05_130_'+str(i), 'M05_150_'+str( + i), 'M05_120_'+str(i), 'M05_140_'+str(i), 'M05_110_'+str(i), 'M05_100_'+str(i), 'M05_160_'+str(i)] + + # correct 'M16_200_2' to missing 'M16_100_2' + npi_codes_prior[npi_codes_prior == 
'M16_200_2'] = 'M16_100_2' + + # check for missing codes + if not read_data: + npi_codes_prior_data = df_npis_old[dd.EngEng['npiCode']].unique() + else: + npi_codes_prior_data = list(df_npis.columns[2:]) + + missing_codes = list(set(npi_codes_prior).difference( + npi_codes_prior_data)) + if len(missing_codes) > 0: + # if incidence is grouped, only search for grouping codes without + # having a detailed "_DETAIL" naming as of MCODE_NUMBER_DETAIL + if fine_resolution == 1: + missing_grouped_codes = [] + for mcode in missing_codes: + if len(mcode.split('_')) < 2: + missing_grouped_codes.append(mcode) + if len(missing_grouped_codes) > 0: # only MCODE_NUMBER codes + print('Missing NPI codes: ' + str(missing_grouped_codes)) + else: + print('Missing NPI codes: ' + str(missing_codes)) + + # we dont have any explanations on these codes, so drop the rows. + # codes_dropped = list(set(npi_codes_prior_data).difference(npi_codes_prior)) + # df_npis_old = df_npis_old[~df_npis_old[dd.EngEng['npiCode']].isin(codes_dropped)].reset_index() + + npi_codes = sorted(npi_codes_prior) + npi_codes_sorting = np.argsort(npi_codes_prior) + if fine_resolution > 0: + # for subcategories, description is in "Beschreibung" column; "Variable" + # column is repeated after ";" sign (except for 6 first rows where some + # error is probably present) + npi_desc = list(df_npis_desc["Beschreibung"][npi_codes_sorting]) + + # stupid test start + dummy_a = list(df_npis_desc["Variable"][npi_codes_sorting]) + dummy_b = df_npis_desc["Beschreibung"][npi_codes_sorting] + dummy_c = [str(x).split("; ")[1] for x in dummy_b] + errors = [] + for i in range(len(dummy_a)): + if not dummy_a[i] == dummy_c[i]: + errors.append(i) + if not errors == [0, 1, 2, 3, 4, 5]: + print("Additional error in naming...") + # stupid test end + + # correct error (mainly done for plotting reasons, otherwise naming + # column not strictly necessary) + for i in range(6): + npi_desc[i] = npi_desc[i].split("; ")[0] + "; " + dummy_a[i] + + else: + # extract variable names for main categories + npi_desc = list(df_npis_desc["Variable"][npi_codes_sorting]) + + # group incidence NPIs to remove product space of + # NPI x from_inc(incid_not_matter, 0, 10, 35, 50, 100) + if fine_resolution == 1: + npi_codes_noincind_dict = dict() + major_code = npi_codes[0] + for code in npi_codes: + if major_code in code: + npi_codes_noincind_dict[code] = major_code + else: + major_code = code + npi_codes_noincind_dict[code] = code + + # get unique list + npi_codes_incgrouped_list = sorted( + set(npi_codes_noincind_dict.values())) + + if not read_data: + # replace more detailed code names X_Y with major code X + df_npis_old[dd.EngEng['npiCode']] = df_npis_old[dd.EngEng[ + 'npiCode']].replace(npi_codes_noincind_dict) + + if fine_resolution == 1: + npi_codes_considered = npi_codes_incgrouped_list + else: + npi_codes_considered = npi_codes + + # transform data from original format to desired format + if not read_data: + # get county ids + unique_geo_entities = geoger.get_county_ids() + + start_npi_cols = list( + df_npis_old.columns).index( + dd.EngEng['npiCode']) + 1 + + # create new data frame for all NPIs given in the columns, resolved by + # county and day + df_npis = pd.DataFrame( + columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + + npi_codes_considered) + # convert NPI data from object to int such that correlations can be + # computed + df_npis = df_npis.astype(dict( + zip( + [dd.EngEng['date']] + [dd.EngEng['idCounty']] + + npi_codes_considered, ['str', 'int'] + + ['int' for i in 
npi_codes_considered]))) + + # store string dates 'dYYYYMMDD' in list before parsing + str_dates = list(df_npis_old.iloc[:, start_npi_cols:].columns) + # convert string dates into other format + dates_new = [datetime.strptime(old_date, "d%Y%m%d") + for old_date in str_dates] + + date_diff = [ + (dates_new[i + 1] - dates_new[i]).days + for i in range(len(dates_new) - 1)] + date_diff_idx = np.where(np.array(date_diff) > 1)[0] + if max(date_diff) > 1: + print("Error. Dates missing in data frame:") + for i in date_diff_idx: + print( + "\t - From " + str(dates_new[i] + timedelta(1)) + " until " + + str(dates_new[i] + timedelta(date_diff[i] - 1))) + + # get RKI infectious numbers to find dates where incidence-dependent + # NPIs were active + if fine_resolution > 0: + df_infec_rki = pd.read_json( + 'data/pydata/Germany/all_county_all_dates_repdate_rki.json') + + # iterate over countyIDs + counters = np.zeros(3) + for countyID in [3101]: # unique_geo_entities: + + # get county-local data frame + start_time = time.perf_counter() + df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] + == countyID].copy() + + if fine_resolution == 1: + # group by incidence (former codes X1_Y, X1_Z where transformed + # to X1, X2) and take max value + df_local_old = df_local_old.groupby(dd.EngEng['npiCode']).max() + # insert aggregated NPI code column + df_local_old.insert( + loc=start_npi_cols - 1, column=dd.EngEng['npiCode'], + value=df_local_old.index) + + # remove potential rows that do not have any explanation in + # additional explanation sheet + # (e.g., M22-M24 main categories seem to be placeholders) + npi_rows = [i in npi_codes_considered + for i in df_local_old[dd.EngEng['npiCode']]] + + # get list of NPI codes, ordered as the rows in the current data frame + npi_codes_ordered_as_rows = df_local_old[dd.EngEng['npiCode']][ + npi_rows].to_list() + + # get indices of rows for the NPI codes as in the sorted npi_codes list + npi_code_rows_to_sorted = [ + npi_codes_ordered_as_rows.index(i) for i in + npi_codes_considered] + + # access NPI values matrix and store it as integers + npi_vals = df_local_old.iloc[npi_rows, start_npi_cols:].astype(int) + + # create columns for date, county ID and NPI code + df_local_new = pd.DataFrame( + columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + + npi_codes_considered) + + # fill in NPI values by transposing from columns to rows + df_local_new[dd.EngEng['date']] = dates_new + df_local_new[dd.EngEng['idCounty']] = countyID + # possible resorting of rows such that they are sorted according to + # a literal sorting of the code strings + df_local_new[npi_codes_considered] = np.transpose( + npi_vals.iloc[npi_code_rows_to_sorted, :].values) + + counters[0] += time.perf_counter()-start_time + + start_time = time.perf_counter() + + # replace -99 ("not implemented anymore") by 0 ("not implemented") + df_local_new[npi_codes_considered] = df_local_new[npi_codes_considered].replace( + -99, 0) + # replace 2,3,4,5 ("implemented by ...") by 1 ("implemented") + df_local_new[npi_codes_considered] = df_local_new[npi_codes_considered].replace([ + 2, 3, 4, 5], 1) + + counters[1] += time.perf_counter()-start_time + + start_time = time.perf_counter() + df_npis = df_npis.append(df_local_new.copy()) + counters[2] += time.perf_counter()-start_time + + # kita + # figcode = ["M03_0" + str(i)+str(j) for i in range(10,70,10) for j in [''] + ['_'+str(k) for k in range(1,6)]] + # # school + # figcode = ["M02a_0" + str(i)+str(j) for i in [10,20,30,31,32,33,34,35,36] for j in [''] + ['_'+str(k) 
for k in range(1,6)]] + # for bb in figcode: + # dates_mentioned = np.array(dates_new)[list(np.where(df_npis.loc[:,bb]>0)[0])] + # print('') + # customPlot.plotList(df_npis.loc[:,"Date"], [df_npis.loc[:,bb] for bb in figcode], legend=figcode, title='asd', xlabel='asd', ylabel='ad', fig_name='asd') + + print(counters) + + # reset index and drop old index column + df_npis.reset_index(inplace=True) + try: + df_npis = df_npis.drop(columns='index') + except: + pass + try: + df_npis = df_npis.drop(columns='level_0') + except: + pass + + print( + "Time needed: " + str(counters[0]) + ", " + str(counters[1]) + ", " + + str(counters[2]) + " sec") + + #### start validation #### + if fine_resolution > 1: + # Cologne for M01a_010 and all dates (no changes) + dummy_old = df_npis_old[(df_npis_old.ID_County == 5315) & ( + df_npis_old.NPI_code == 'M01a_010')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis.ID_County == + 5315, 'M01a_010'].values + print(abs(dummy_old-dummy_new).sum() == 0) + + # Flensburg for M05_120 and all dates ('2's become '1's) + dummy_old = df_npis_old[(df_npis_old.ID_County == 1001) & ( + df_npis_old.NPI_code == 'M05_120')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis.ID_County == + 1001, 'M05_120'].values + print(abs(dummy_old-dummy_new).sum() == 5) + + # Munich for M01a_010_4 and all dates (-99 becomes 0, 0 stays 0) + dummy_old = df_npis_old[(df_npis_old.ID_County == 9162) & ( + df_npis_old.NPI_code == 'M01a_010_4')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis.ID_County == + 9162, 'M01a_010_4'].values + print(abs(dummy_old-dummy_new).sum() == 422*99) + + # Weimar for M12_030_3 and all dates (-99 becomes 0, 1 stays 1, 0 stays 0) + dummy_old = df_npis_old[(df_npis_old.ID_County == 16071) & ( + df_npis_old.NPI_code == 'M12_030_3')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis.ID_County == + 16071, 'M12_030_3'].values + print(abs(dummy_old-dummy_new).sum() == 422*99) + + # Berlin for M01b_020 and all dates (2 becomes 1, 1 stays 1, 0 stays 0) + dummy_old = df_npis_old[(df_npis_old.ID_County == 11000) & ( + df_npis_old.NPI_code == 'M01b_020')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis.ID_County == + 11000, 'M01b_020'].values + print(abs(dummy_old-dummy_new).sum() == 82) + + # Segeberg for M02b_035 and all dates (2 -> 1, 3 -> 1, 5 -> 1) + dummy_old = df_npis_old[(df_npis_old.ID_County == 1060) & ( + df_npis_old.NPI_code == 'M02b_035')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis.ID_County == + 1060, 'M02b_035'].values + print(abs(dummy_old-dummy_new).sum() == 151+2*53+4*22) + + # Steinfurt for M16_050 and all dates (4 -> 1, ...) 
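+        # A note on the expected value below (inferred from the check
+        # itself, not verified against the raw data): mapping 2 -> 1,
+        # 3 -> 1 and 4 -> 1 changes a daily value by 1, 2 and 3,
+        # respectively, so 32 + 2*20 + 3*22 corresponds to 32, 20 and 22
+        # days on which M16_050 was recorded with value 2, 3 and 4.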
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 5566) & (
+            df_npis_old.NPI_code == 'M16_050')].values[0][start_npi_cols:]
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                5566, 'M16_050'].values
+        print(abs(dummy_old-dummy_new).sum() == 32+2*20+3*22)
+    elif fine_resolution == 1:
+        # Cologne for M01a_010 and all dates (no changes)
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 5315) & (
+            df_npis_old.NPI_code == 'M01a_010')].values[0][start_npi_cols:]
+        for subcode in range(1, 6):  # add subcode values
+            dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 5315) & (
+                df_npis_old.NPI_code == 'M01a_010')].values[subcode][start_npi_cols:])
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                5315, 'M01a_010'].values
+        print(abs(dummy_old-dummy_new).sum() == 0)
+
+        # Flensburg for M05_120 and all dates ('2's become '1's)
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 1001) & (
+            df_npis_old.NPI_code == 'M05_120')].values[0][start_npi_cols:]
+        for subcode in range(1, 6):  # add subcode values
+            dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 1001) & (
+                df_npis_old.NPI_code == 'M05_120')].values[subcode][start_npi_cols:])
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                1001, 'M05_120'].values
+        print(abs(dummy_old-dummy_new).sum() == 5)
+
+        # Munich for M01a_010 and all dates (-99 becomes 0, 0 stays 0)
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 9162) & (
+            df_npis_old.NPI_code == 'M01a_010')].values[0][start_npi_cols:]
+        for subcode in range(1, 6):  # add subcode values
+            dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 9162) & (
+                df_npis_old.NPI_code == 'M01a_010')].values[subcode][start_npi_cols:])
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                9162, 'M01a_010'].values
+        print(abs(dummy_old-dummy_new).sum() == 0)
+
+        # Weimar for M12_030 and all dates (-99 becomes 0, 1 stays 1, 0 stays 0)
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 16071) & (
+            df_npis_old.NPI_code == 'M12_030')].values[0][start_npi_cols:]
+        for subcode in range(1, 6):  # add subcode values
+            dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 16071) & (
+                df_npis_old.NPI_code == 'M12_030')].values[subcode][start_npi_cols:])
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                16071, 'M12_030'].values
+        print(abs(dummy_old-dummy_new).sum() == 19)
+
+        # Berlin for M01b_020 and all dates (2 becomes 1, 1 stays 1, 0 stays 0)
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 11000) & (
+            df_npis_old.NPI_code == 'M01b_020')].values[0][start_npi_cols:]
+        for subcode in range(1, 6):  # add subcode values
+            dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 11000) & (
+                df_npis_old.NPI_code == 'M01b_020')].values[subcode][start_npi_cols:])
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                11000, 'M01b_020'].values
+        print(abs(dummy_old-dummy_new).sum() == 82)
+
+        # Segeberg for M02b_035 and all dates (2 -> 1, 3 -> 1, 5 -> 1)
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 1060) & (
+            df_npis_old.NPI_code == 'M02b_035')].values[0][start_npi_cols:]
+        for subcode in range(1, 6):  # add subcode values
+            dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 1060) & (
+                df_npis_old.NPI_code == 'M02b_035')].values[subcode][start_npi_cols:])
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                1060, 'M02b_035'].values
+        print(abs(dummy_old-dummy_new).sum() == 151+2*53+4*22)
+
+        # Steinfurt for M16_050 and all dates (4 -> 1, ...)
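+        # Sketch of the aggregation used in this branch (a reading of the
+        # code, not an authoritative statement): in the raw frame each main
+        # code still has one row per incidence subcode, so np.maximum folds
+        # rows .values[1] to .values[5] into the main-code row .values[0],
+        # mirroring the incidence grouping applied to the transformed frame.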
+        dummy_old = df_npis_old[(df_npis_old.ID_County == 5566) & (
+            df_npis_old.NPI_code == 'M16_050')].values[0][start_npi_cols:]
+        for subcode in range(1, 6):  # add subcode values
+            dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 5566) & (
+                df_npis_old.NPI_code == 'M16_050')].values[subcode][start_npi_cols:])
+        dummy_new = df_npis.loc[df_npis.ID_County ==
+                                5566, 'M16_050'].values
+        print(abs(dummy_old-dummy_new).sum() == 32+2*20+3*22)
+    #### end validation ####
+
+    if fine_resolution > 0:
+        if fine_resolution == 1:
+            filename = 'germany_counties_npi_subcat_incgrouped'
+        else:
+            filename = 'germany_counties_npi_subcat'
+    else:
+        filename = 'germany_counties_npi_maincat'
+    gd.write_dataframe(df_npis, directory, filename, file_format)
+
+    # optional validation of file writing/reading
+    # df_validation = pd.read_json(directory + filename + ".json")
+    # if len(
+    #         np.where(
+    #             df_validation.iloc[:, start_npi_cols - 1:] != df_npis.iloc
+    #             [:, start_npi_cols - 1:])[0]) > 0:
+    #     print('Error in file writing/reading')
+
+    # get code levels (main/subcodes) and position of main codes
+    # code_level = [i.count('_') for i in npi_codes]
+    # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1]
+
+    # check if any other integer than 0: not implemented or 1: implemented is
+    # used (maybe to specify the kind of implementation)
+    if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0:
+
+        print("Info: Please ensure that NPI information is only boolean.")
+
+    else:
+        # sum over different NPIs and plot share of counties implementing
+        # these NPIs versus counties without corresponding actions
+        df_npis_aggregated = df_npis.groupby(
+            dd.EngEng['date']).agg(
+            {i: sum for i in npi_codes_considered}).copy()
+        npis_total_sum = df_npis_aggregated.sum()
+
+        npi_codes_empty = list(np.array(npi_codes_considered)[
+            np.where(npis_total_sum == 0)[0]])
+
+        npi_unused_indices_all = []
+        npi_used_indices_all = []
+        npi_unused_indices = []
+        npi_used_indices = []
+        for i in range(len(npi_codes_considered)):
+            if npi_codes_considered[i] in npi_codes_empty:
+                npi_unused_indices.append(i)
+                npi_unused_indices_all.append(
+                    npi_codes.index(npi_codes_considered[i]))
+            else:
+                npi_used_indices.append(i)
+                npi_used_indices_all.append(
+                    npi_codes.index(npi_codes_considered[i]))
+
+        npis_unused = np.array(npi_desc)[npi_unused_indices_all]
+        npis_used = np.array(npi_desc)[npi_used_indices_all]
+        npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices])
+        npi_codes_unused = list(
+            np.array(npi_codes_considered)[npi_unused_indices])
+
+        # open file to write unused categories
+        if fine_resolution > 0:
+            if fine_resolution == 1:
+                filename = 'unused_subcats_incgrouped.txt'
+            else:
+                filename = 'unused_subcats.txt'
+        else:
+            filename = 'unused_maincats.txt'
+        file_npi = open(directory + filename, 'w')
+        # Writing unused NPIs
+        for i in range(len(npis_unused)):
+            file_npi.write(npi_codes_unused[i] + ": " + npis_unused[i])
+            file_npi.write("\n")
+        # Closing file
+        file_npi.close()
+
+        # open file to write used categories
+        if fine_resolution > 0:
+            if fine_resolution == 1:
+                filename = 'used_subcats_incgrouped.txt'
+            else:
+                filename = 'used_subcats.txt'
+        else:
+            filename = 'used_maincats.txt'
+        file_npi = open(directory + filename, 'w')
+        # Writing used NPIs
+        for i in range(len(npis_used)):
+            file_npi.write(npi_codes_used[i] + ": " + npis_used[i])
+            file_npi.write("\n")
+        # Closing file
+        file_npi.close()
+
+        df_npis_used = df_npis[[dd.EngEng['date'],
+                                dd.EngEng['idCounty']] +
npi_codes_used].copy() + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'germany_counties_npi_subcat_used_incgrouped' + else: + filename = 'germany_counties_npi_subcat_used' + else: + filename = 'germany_counties_npi_maincat_used' + gd.write_dataframe(df_npis_used, directory, filename, file_format) + + # compute correlations + npis_corr = df_npis_used.iloc[:, 2:].corr().values + # plot log-colored correlations + plt.imshow(abs(npis_corr), cmap='gray_r') + # plot histogram + plt.figure() + plt.hist(npis_corr.flatten(), bins=50) + plt.title("Correlation histogram", fontsize=18) + plt.xlabel("Correlation", fontsize=12) + plt.ylabel("Number of values", fontsize=12) + + # We understand the rows of npis_corr, the correlations of one NPI + # to the others as one node in the #NPIs-used-dimensional space. + # We compute the pairwise distances of these nodes. Then, nodes with + # similar correlations towards all other nodes exhibit small distances + corr_pairwdist = hierarchy.distance.pdist( + npis_corr, metric='euclidean') + + # compute hierarchical clustering (via best-suited metric) + compare_metrics = True + if compare_metrics: + # centroid + metric = 'centroid' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + abs(npis_corr), + corr_pairwdist, + metric) + # # plot dendrogram + plt.figure() + plt.title(metric) + hierarchy.dendrogram(cluster_hierarch) + plt.show() + max_coph_dist = coph_dist.max() + flatten_hierarch_clustering( + abs(npis_corr), cluster_hierarch, + [wg * max_coph_dist + for wg in [0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75]]) + # ward + metric = 'ward' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + npis_corr, + corr_pairwdist, + metric) + # # plot dendrogram + # plt.figure() + # plt.title(metric) + # hierarchy.dendrogram(cluster_hierarch) + # plt.show() + max_coph_dist = coph_dist.max() + flatten_hierarch_clustering( + abs(npis_corr), cluster_hierarch, + [wg * max_coph_dist for wg in [0.1, 0.125, 0.15, 0.175, 0.2]]) + # average + metric = 'average' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + abs(npis_corr), + corr_pairwdist, + metric) + # # plot dendrogram + # plt.figure() + # plt.title(metric) + # hierarchy.dendrogram(cluster_hierarch) + # plt.show() + max_coph_dist = coph_dist.max() + flatten_hierarch_clustering( + npis_corr, cluster_hierarch, + [wg * max_coph_dist + for wg in [0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65]]) + + metric = 'centroid' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + npis_corr, + corr_pairwdist, + metric) + # # plot dendrogram + # plt.figure() + # plt.title(metric) + # hierarchy.dendrogram(cluster_hierarch) + # plt.show() + max_coph_dist = coph_dist.max() + npi_idx_to_cluster_idx = flatten_hierarch_clustering( + npis_corr, cluster_hierarch, + [wg * max_coph_dist + for wg in [0.65]]) + + cluster_dict = dict() + cluster_codes = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] + cluster_desc = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] + for i in range(len(npi_idx_to_cluster_idx[0])): + cluster_dict[npi_codes_used[i] + ] = "CM_" + str(npi_idx_to_cluster_idx[0][i]).zfill(3) + cluster_codes[npi_idx_to_cluster_idx[0] + [i]].append(npi_codes_used[i]) + cluster_desc[npi_idx_to_cluster_idx[0] + [i]].append(str(npis_used[i])) + + # create clustered dataframe + df_npis_clustered = df_npis[[ + dd.EngEng['date'], dd.EngEng['idCounty']]].copy() + + for i in range(len(cluster_codes)): + df_npis_clustered["CM_" + str(i).zfill(3) + ] = 
df_npis[cluster_codes[i]].max(axis=1).copy() + + npis_corr_cluster = df_npis_clustered.corr() + # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 + plt.imshow(abs(npis_corr_cluster), cmap='gray_r') + plt.title('Absolute correlation>0.25 of clustered NPIs') + plt.xlabel('NPI cluster') + plt.ylabel('NPI cluster') + plt.colorbar() + + # open file to write unused categories + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'clusters_subcats_incgrouped.txt' + else: + filename = 'clusters_subcats.txt' + else: + filename = 'clusters_maincats.txt' + file_npi = open(directory + filename, 'w') + # Writing unused NPIs + for i in range(len(cluster_codes)): + file_npi.write("Cluster " + str(i) + "\n") + for j in range(len(cluster_codes[i])): + file_npi.write(cluster_codes[i][j] + ": " + cluster_desc[i][j]) + file_npi.write("\n") + file_npi.write("\n") + # Closing file + file_npi.close() + + npi_idx_new = np.argsort(npi_idx_to_cluster_idx[0]) + npis_corr_reorder = npis_corr[npi_idx_new, :][:, npi_idx_new] + + plt.imshow(abs(npis_corr_reorder), cmap='gray_r') + plt.colorbar() + + # npi_indices_all = set(range(npis_corr.shape[0])) + # for i in [40]:#[10, 20, 40, 80, 160]: + # kmeans_npis = KMeans(n_clusters=i).fit(df_npis_used.iloc[:,2:].T) + # evaluate_clustering(npis_corr, kmeans_npis.labels_, npi_indices_all) + + # for i in [40]:#[10, 20, 40, 80, 160]: + # kmeans_corr = KMeans(n_clusters=i).fit(npis_corr) + # evaluate_clustering(npis_corr, kmeans_corr.labels_, npi_indices_all) + + # corr_threshold = 0.5 + # corr_indices_threshold = np.where(npis_corr > corr_threshold) + # npis_corr_threshold = np.zeros(npis_corr.shape) + # npis_corr_threshold[corr_indices_threshold] = npis_corr[corr_indices_threshold] + # plt.imshow(npis_corr_threshold, cmap='gray_r') + + # plot share of counties that implement the main categories + if make_plot: + # plot four different subsets of curves for better distinction + j = 0 + if fine_resolution > 0: + num_images = 15 + else: + num_images = 1 + for i in [ + slice( + int(len(npi_codes_used) / num_images) * i, + min( + int(len(npi_codes_used) / num_images) * + (i + 1), + len(npis_used))) for i in range( + num_images + 1)]: + customPlot.plotList(df_npis_aggregated.index, + [df_npis_aggregated[code] + for code in npi_codes_used[i]], + npis_used[i], + 'Counties implementing NPI main categories', + 'Date', 'Number', "Counties_NPI_main_" + + str(j) + "_of_"+str(num_images)) + j += 1 + + +def main(): + """! Main program entry.""" + + # arg_dict = gd.cli("testing") + transform_npi_data(fine_resolution=1, read_data=False, make_plot=True) + + +if __name__ == "__main__": + + main() From 3115bf3a57d0c79f70bad116c603879701f33553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 28 Mar 2022 18:25:47 +0200 Subject: [PATCH 002/104] directory for raw data changed; new data separated by comma --- .../memilio/epidata/transformNPIData.py | 26 ++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 490b08a195..eb8befeeb5 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -183,7 +183,7 @@ def print_manual_download(filename, url): print( 'This script needs manual downloading of files. Please register' ' at corona-datenplatform.com and download ' + filename + ' from ' + url + - '. 
Then move it to a folder named npi_raw in this directory.') + '. Then move it to a folder named raw_data in this directory.') def transform_npi_data(fine_resolution=2, @@ -205,7 +205,7 @@ def transform_npi_data(fine_resolution=2, and - kr_massnahmen_oberkategorien.csv from https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise - and move it to a folder named *npi_raw* in the current directory. + and move it to the *directory*-path mentioned in the beginning of the function. @param fine_resolution 2 [Default] or 0 or 1. Defines which categories are considered. @@ -236,7 +236,9 @@ def transform_npi_data(fine_resolution=2, if fine_resolution > 0: try: df_npis_old = pd.read_csv( - 'npi_raw/kr_massnahmen_unterkategorien.csv', sep=';') + os.path.join( + directory, 'kr_massnahmen_unterkategorien.csv'), + sep=',') except FileNotFoundError: print_manual_download( 'kr_massnahmen_unterkategorien.csv', @@ -246,7 +248,8 @@ def transform_npi_data(fine_resolution=2, try: df_npis_desc = pd.read_excel( - 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', + os.path.join( + directory, 'datensatzbeschreibung_massnahmen.xlsx'), sheet_name=2) except FileNotFoundError: print_manual_download( @@ -266,8 +269,8 @@ def transform_npi_data(fine_resolution=2, else: try: - df_npis_old = pd.read_csv( - 'npi_raw/kr_massnahmen_oberkategorien.csv') + df_npis_old = pd.read_csv(os.path.join( + directory, 'kr_massnahmen_oberkategorien.csv')) except FileNotFoundError: print_manual_download( 'datensatzbeschreibung_massnahmen.xlsx', @@ -277,7 +280,8 @@ def transform_npi_data(fine_resolution=2, try: df_npis_desc = pd.read_excel( - 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', + os.path.join( + directory, 'datensatzbeschreibung_massnahmen.xlsx'), sheet_name=3) except FileNotFoundError: print_manual_download( @@ -300,10 +304,14 @@ def transform_npi_data(fine_resolution=2, try: if fine_resolution > 0: df_npis_desc = pd.read_excel( - 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', sheet_name=2) + os.path.join( + directory, 'datensatzbeschreibung_massnahmen.xlsx'), + sheet_name=2) else: df_npis_desc = pd.read_excel( - 'npi_raw/datensatzbeschreibung_massnahmen.xlsx', sheet_name=3) + os.path.join( + directory, 'datensatzbeschreibung_massnahmen.xlsx'), + sheet_name=3) except FileNotFoundError: print_manual_download( 'datensatzbeschreibung_massnahmen.xlsx', From a485663e32a3c2ec0867e7c9ee68b2d9cf78d930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 31 Mar 2022 18:55:00 +0200 Subject: [PATCH 003/104] auxiliary files were read twice --- .../memilio/epidata/transformNPIData.py | 28 +++---------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index eb8befeeb5..c036806a70 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -246,18 +246,8 @@ def transform_npi_data(fine_resolution=2, raise FileNotFoundError df_npis_old.rename(dd.GerEng, axis=1, inplace=True) - try: - df_npis_desc = pd.read_excel( - os.path.join( - directory, 'datensatzbeschreibung_massnahmen.xlsx'), - sheet_name=2) - except FileNotFoundError: - print_manual_download( - 'datensatzbeschreibung_massnahmen.xlsx', - 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') - raise FileNotFoundError - - # check on Krankenhaeuser and Pflege + # check if rows hospitals and 
geriatric care are still empty + # these fields have been empty so far and are thus not used test_codes = ['M23_010', 'M23_020', 'M23_030', 'M23_040', 'M23_050', 'M23_060', 'M24_010', 'M24_020', 'M24_030', 'M24_040', 'M24_050', 'M24_060'] @@ -267,7 +257,8 @@ def transform_npi_data(fine_resolution=2, print(tcode+i + " used.") # end check - else: + else: # read aggregated NPIs + try: df_npis_old = pd.read_csv(os.path.join( directory, 'kr_massnahmen_oberkategorien.csv')) @@ -278,17 +269,6 @@ def transform_npi_data(fine_resolution=2, raise FileNotFoundError df_npis_old.rename(dd.GerEng, axis=1, inplace=True) - try: - df_npis_desc = pd.read_excel( - os.path.join( - directory, 'datensatzbeschreibung_massnahmen.xlsx'), - sheet_name=3) - except FileNotFoundError: - print_manual_download( - 'datensatzbeschreibung_massnahmen.xlsx', - 'https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise') - raise FileNotFoundError - else: # read formatted file if fine_resolution > 0: From ed9e10e70ad735f4c97e8d36fccf46f60bd97da8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Tue, 5 Apr 2022 19:25:02 +0200 Subject: [PATCH 004/104] small redesign and WIP of evaluation against confirmed infections --- .../memilio/epidata/transformNPIData.py | 175 +++++++++++------- 1 file changed, 108 insertions(+), 67 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index c036806a70..9b94c99392 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -18,6 +18,7 @@ # limitations under the License. ############################################################################# from datetime import datetime, timedelta +import sys import time import os import pandas as pd @@ -304,7 +305,8 @@ def transform_npi_data(fine_resolution=2, # correct differences in codes between data sheet and explanation sheet if fine_resolution > 0: # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} - for i in range(1, 6): + # (M04_1, i.e. i=1, has been corrected in original file but not for i>1) + for i in range(2, 6): npi_codes_prior[npi_codes_prior == 'M04_'+str(i)] = ['M04_120_'+str( i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] @@ -333,23 +335,31 @@ def transform_npi_data(fine_resolution=2, if len(mcode.split('_')) < 2: missing_grouped_codes.append(mcode) if len(missing_grouped_codes) > 0: # only MCODE_NUMBER codes - print('Missing NPI codes: ' + str(missing_grouped_codes)) + sys.exit('Missing NPI codes: ' + + str(missing_grouped_codes)) else: - print('Missing NPI codes: ' + str(missing_codes)) - - # we dont have any explanations on these codes, so drop the rows. - # codes_dropped = list(set(npi_codes_prior_data).difference(npi_codes_prior)) - # df_npis_old = df_npis_old[~df_npis_old[dd.EngEng['npiCode']].isin(codes_dropped)].reset_index() - - npi_codes = sorted(npi_codes_prior) - npi_codes_sorting = np.argsort(npi_codes_prior) + sys.exit('Missing NPI codes: ' + str(missing_codes)) + + # we dont have any explanations from "datensatzbeschreibung_massnahmen" + # on these codes, so drop the rows. 
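+    # Sketch of the set difference below (hypothetical codes, for
+    # illustration only): with npi_codes_prior_data = ['M01a_010', 'M22_010']
+    # and npi_codes_prior = ['M01a_010'], codes_dropped == ['M22_010'] and
+    # all rows of df_npis_old carrying 'M22_010' are then removed.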
+ codes_dropped = list(set(npi_codes_prior_data).difference( + npi_codes_prior)) + if len(codes_dropped) > 0: + df_npis_old = df_npis_old[~df_npis_old[dd.EngEng['npiCode']].isin( + codes_dropped)].reset_index() + + # sort NPI codes according to numeric values (argsort gives indices + # in input list to be used for sorted array) + npi_codes_sorting = np.argsort(npi_codes_prior.values) + npi_codes = list(npi_codes_prior[npi_codes_sorting]) if fine_resolution > 0: - # for subcategories, description is in "Beschreibung" column; "Variable" - # column is repeated after ";" sign (except for 6 first rows where some - # error is probably present) + # for subcategories, description is in "Beschreibung" column; The + # "Variable" column is repeated after the ";" sign + # (except for 6 first rows where there is probably some error) npi_desc = list(df_npis_desc["Beschreibung"][npi_codes_sorting]) - # stupid test start + # Check for consistent naming in descriptions + # Errors are known for the first 6 rows dummy_a = list(df_npis_desc["Variable"][npi_codes_sorting]) dummy_b = df_npis_desc["Beschreibung"][npi_codes_sorting] dummy_c = [str(x).split("; ")[1] for x in dummy_b] @@ -358,11 +368,11 @@ def transform_npi_data(fine_resolution=2, if not dummy_a[i] == dummy_c[i]: errors.append(i) if not errors == [0, 1, 2, 3, 4, 5]: - print("Additional error in naming...") - # stupid test end + print("Additional errors in consistent naming.") + # End of check - # correct error (mainly done for plotting reasons, otherwise naming - # column not strictly necessary) + # correct for consistent naming (mainly done for plotting reasons, + # otherwise naming column not strictly necessary) for i in range(6): npi_desc[i] = npi_desc[i].split("; ")[0] + "; " + dummy_a[i] @@ -370,43 +380,27 @@ def transform_npi_data(fine_resolution=2, # extract variable names for main categories npi_desc = list(df_npis_desc["Variable"][npi_codes_sorting]) - # group incidence NPIs to remove product space of - # NPI x from_inc(incid_not_matter, 0, 10, 35, 50, 100) - if fine_resolution == 1: - npi_codes_noincind_dict = dict() - major_code = npi_codes[0] - for code in npi_codes: - if major_code in code: - npi_codes_noincind_dict[code] = major_code - else: - major_code = code - npi_codes_noincind_dict[code] = code - - # get unique list - npi_codes_incgrouped_list = sorted( - set(npi_codes_noincind_dict.values())) - - if not read_data: - # replace more detailed code names X_Y with major code X - df_npis_old[dd.EngEng['npiCode']] = df_npis_old[dd.EngEng[ - 'npiCode']].replace(npi_codes_noincind_dict) - - if fine_resolution == 1: - npi_codes_considered = npi_codes_incgrouped_list - else: - npi_codes_considered = npi_codes - # transform data from original format to desired format if not read_data: + # could be used to reduced codes to be considered + npi_codes_considered = npi_codes + # get county ids unique_geo_entities = geoger.get_county_ids() + # remove counties which do not exist anymore + if df_npis_old[ + ~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][ + dd.EngEng['idCounty']].unique() != 16056: + sys.exit('More than county of Eisenach removed') + df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( + unique_geo_entities)] start_npi_cols = list( df_npis_old.columns).index( dd.EngEng['npiCode']) + 1 - # create new data frame for all NPIs given in the columns, resolved by - # county and day + # create new data frame for all NPIs given in the columns, + # resolved by county and day df_npis = pd.DataFrame( 
columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + npi_codes_considered) @@ -424,6 +418,7 @@ def transform_npi_data(fine_resolution=2, dates_new = [datetime.strptime(old_date, "d%Y%m%d") for old_date in str_dates] + # check for missing dates date_diff = [ (dates_new[i + 1] - dates_new[i]).days for i in range(len(dates_new) - 1)] @@ -434,15 +429,17 @@ def transform_npi_data(fine_resolution=2, print( "\t - From " + str(dates_new[i] + timedelta(1)) + " until " + str(dates_new[i] + timedelta(date_diff[i] - 1))) + sys.exit('Exiting. Dates missing in data frame.') # get RKI infectious numbers to find dates where incidence-dependent # NPIs were active if fine_resolution > 0: - df_infec_rki = pd.read_json( - 'data/pydata/Germany/all_county_all_dates_repdate_rki.json') + df_infec_rki = pd.read_json(os.path.join( + directory, 'all_county_all_dates_repdate_rki.json')) # iterate over countyIDs - counters = np.zeros(3) + counters = np.zeros(4) # time counter for output only + cid = 0 for countyID in [3101]: # unique_geo_entities: # get county-local data frame @@ -450,18 +447,7 @@ def transform_npi_data(fine_resolution=2, df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == countyID].copy() - if fine_resolution == 1: - # group by incidence (former codes X1_Y, X1_Z where transformed - # to X1, X2) and take max value - df_local_old = df_local_old.groupby(dd.EngEng['npiCode']).max() - # insert aggregated NPI code column - df_local_old.insert( - loc=start_npi_cols - 1, column=dd.EngEng['npiCode'], - value=df_local_old.index) - - # remove potential rows that do not have any explanation in - # additional explanation sheet - # (e.g., M22-M24 main categories seem to be placeholders) + # remove potential rows of which codes are not in npi_codes_considered npi_rows = [i in npi_codes_considered for i in df_local_old[dd.EngEng['npiCode']]] @@ -470,6 +456,7 @@ def transform_npi_data(fine_resolution=2, npi_rows].to_list() # get indices of rows for the NPI codes as in the sorted npi_codes list + # may be superfluous if NPI code rows are sorted correctly npi_code_rows_to_sorted = [ npi_codes_ordered_as_rows.index(i) for i in npi_codes_considered] @@ -490,22 +477,47 @@ def transform_npi_data(fine_resolution=2, df_local_new[npi_codes_considered] = np.transpose( npi_vals.iloc[npi_code_rows_to_sorted, :].values) - counters[0] += time.perf_counter()-start_time + counters[cid] += time.perf_counter()-start_time + cid += 1 start_time = time.perf_counter() - # replace -99 ("not implemented anymore") by 0 ("not implemented") + # replace -99 ("not used anymore") by 0 ("not used") df_local_new[npi_codes_considered] = df_local_new[npi_codes_considered].replace( -99, 0) - # replace 2,3,4,5 ("implemented by ...") by 1 ("implemented") + # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") df_local_new[npi_codes_considered] = df_local_new[npi_codes_considered].replace([ 2, 3, 4, 5], 1) - counters[1] += time.perf_counter()-start_time + counters[cid] += time.perf_counter()-start_time + cid += 1 + + ### evaluate NPIs mentioned with respect to confirmed cases ### + # values > 0 + # - for NPIs independent of new infections mean "mentioned" = "active" + # - for NPIs dependent on incidence "mentioned" does not mean + # active and evaluation has to be conducted against confirmed + # infections to determine whether the NPI was active + start_time = time.perf_counter() + # TODO + counters[cid] += time.perf_counter()-start_time + cid += 1 + ### ### start_time = time.perf_counter() df_npis = 
df_npis.append(df_local_new.copy()) - counters[2] += time.perf_counter()-start_time + counters[cid] += time.perf_counter()-start_time + cid += 1 + + # TODO: aggregation now here + if fine_resolution == 1: + # group by incidence (former codes X1_Y, X1_Z were transformed + # to X1, X2) and take max value + df_local_old = df_local_old.groupby(dd.EngEng['npiCode']).max() + # insert aggregated NPI code column + df_local_old.insert( + loc=start_npi_cols - 1, column=dd.EngEng['npiCode'], + value=df_local_old.index) # kita # figcode = ["M03_0" + str(i)+str(j) for i in range(10,70,10) for j in [''] + ['_'+str(k) for k in range(1,6)]] @@ -518,6 +530,35 @@ def transform_npi_data(fine_resolution=2, print(counters) + # TODO: aggregation now here + # group incidence NPIs to remove product space of + # NPI x active_from_inc (with values "incidence does not matter", and + # incidence 0, 10, 35, 50, 100) + if fine_resolution == 1: + # create hash table from subcode/subcategory to parental or main code/main category + npi_codes_to_maincode_map = dict() + major_code = npi_codes[0] + for code in npi_codes: + if major_code in code: + npi_codes_to_maincode_map[code] = major_code + else: + major_code = code + npi_codes_to_maincode_map[code] = code + + # get unique list of main codes + npi_codes_incgrouped_list = sorted( + set(npi_codes_to_maincode_map.values())) + + if not read_data: + # replace more detailed code names X_Y with major code X + df_npis_old[dd.EngEng['npiCode']] = df_npis_old[dd.EngEng[ + 'npiCode']].replace(npi_codes_to_maincode_map) + + if fine_resolution == 1: + npi_codes_considered = npi_codes_incgrouped_list + else: + npi_codes_considered = npi_codes + # reset index and drop old index column df_npis.reset_index(inplace=True) try: @@ -938,7 +979,7 @@ def main(): """! 
Main program entry.""" # arg_dict = gd.cli("testing") - transform_npi_data(fine_resolution=1, read_data=False, make_plot=True) + transform_npi_data(fine_resolution=2, read_data=False, make_plot=True) if __name__ == "__main__": From 888d524eb556765e81caf459f095f8733edccc9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Wed, 6 Apr 2022 19:34:45 +0200 Subject: [PATCH 005/104] small redesign and further WIP on NPI activation --- .../memilio/epidata/defaultDict.py | 3 +- .../memilio/epidata/transformNPIData.py | 93 +++++++++++++------ 2 files changed, 69 insertions(+), 27 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/defaultDict.py b/pycode/memilio-epidata/memilio/epidata/defaultDict.py index 107a2290a5..8f1c9b6517 100644 --- a/pycode/memilio-epidata/memilio/epidata/defaultDict.py +++ b/pycode/memilio-epidata/memilio/epidata/defaultDict.py @@ -103,7 +103,8 @@ 'nuts3': 'NUTS3', 'total_volume': 'Unique_trips', 'region_name': 'County', - 'region_id': 'ID_County' + 'region_id': 'ID_County', + 'desc': 'description' } GerEng = { diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 9b94c99392..464de4cb63 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -239,7 +239,7 @@ def transform_npi_data(fine_resolution=2, df_npis_old = pd.read_csv( os.path.join( directory, 'kr_massnahmen_unterkategorien.csv'), - sep=',') + sep=',', nrows=1248) # 1248 for debugging, only reading Flensburg except FileNotFoundError: print_manual_download( 'kr_massnahmen_unterkategorien.csv', @@ -300,7 +300,7 @@ def transform_npi_data(fine_resolution=2, raise FileNotFoundError # get existing codes that are used (in df_npis_old M22-M24 are empty) - npi_codes_prior = df_npis_desc.Variablenname + npi_codes_prior = df_npis_desc['Variablenname'] # correct differences in codes between data sheet and explanation sheet if fine_resolution > 0: @@ -373,25 +373,60 @@ def transform_npi_data(fine_resolution=2, # correct for consistent naming (mainly done for plotting reasons, # otherwise naming column not strictly necessary) - for i in range(6): + for i in range(errors[0], errors[-1]+1): npi_desc[i] = npi_desc[i].split("; ")[0] + "; " + dummy_a[i] else: # extract variable names for main categories npi_desc = list(df_npis_desc["Variable"][npi_codes_sorting]) + # NPIs groups codes and description to ensure that both are ordered + # the same way + npis_dummy = {dd.EngEng['npiCode']: npi_codes, dd.EngEng['desc']: npi_desc} + npis = pd.DataFrame(npis_dummy) + + # extract incidence-threshold for NPIs + if fine_resolution > 0: + npi_incid_start = dict() + for i in range(len(npis)): + incid_threshold = 1e10 + if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': + # set -1 for incidence-independent NPIs + incid_threshold = -1 + elif npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': + incid_threshold = int( + npis.loc[i, dd.EngEng['desc']].split(' ')[1]) + else: + sys.exit('Error in description file. NPI activation can not ' + 'be computed. 
Exiting.') + npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] + ] = incid_threshold + + # get all incidence thresholds + incidence_thresholds = sorted(set(npi_incid_start.values())) + # transform data from original format to desired format if not read_data: - # could be used to reduced codes to be considered - npi_codes_considered = npi_codes + # could be used to reduced NPIs to be considered + # NEITHER used NOR tested with subset of NPIs so far. + npis_considered = npis.copy() + incidence_thresholds_to_npis = dict( + zip(incidence_thresholds, [[] for i in range(len(incidence_thresholds))])) + for i in range(len(npis_considered)): + incval = npi_incid_start[npis_considered.loc + [i, dd.EngEng['npiCode']]] + incidence_thresholds_to_npis[incval].append(i) # get county ids unique_geo_entities = geoger.get_county_ids() - # remove counties which do not exist anymore - if df_npis_old[ - ~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][ - dd.EngEng['idCounty']].unique() != 16056: - sys.exit('More than county of Eisenach removed') + # check if more than the county of Eisenach would be removed with + # current county list + counties_removed = df_npis_old[ + ~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][ + dd.EngEng['idCounty']].unique() + if len(counties_removed) == 1 and counties_removed[0] != 16056: + sys.exit('Error. Other counties than that of Eisenach were removed.') + # remove rows for Eisenach df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( unique_geo_entities)] @@ -403,14 +438,14 @@ def transform_npi_data(fine_resolution=2, # resolved by county and day df_npis = pd.DataFrame( columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - npi_codes_considered) + list(npis_considered[dd.EngEng['npiCode']])) # convert NPI data from object to int such that correlations can be # computed df_npis = df_npis.astype(dict( zip( [dd.EngEng['date']] + [dd.EngEng['idCounty']] + - npi_codes_considered, ['str', 'int'] + - ['int' for i in npi_codes_considered]))) + list(npis_considered[dd.EngEng['npiCode']]), ['str', 'int'] + + ['int' for i in npis_considered[dd.EngEng['npiCode']]]))) # store string dates 'dYYYYMMDD' in list before parsing str_dates = list(df_npis_old.iloc[:, start_npi_cols:].columns) @@ -436,11 +471,15 @@ def transform_npi_data(fine_resolution=2, if fine_resolution > 0: df_infec_rki = pd.read_json(os.path.join( directory, 'all_county_all_dates_repdate_rki.json')) + df_population = pd.read_json( + directory + "county_current_population.json") + # compute incidence based on previous data frames + # TODO # iterate over countyIDs counters = np.zeros(4) # time counter for output only cid = 0 - for countyID in [3101]: # unique_geo_entities: + for countyID in [1001]: # unique_geo_entities: # get county-local data frame start_time = time.perf_counter() @@ -448,7 +487,7 @@ def transform_npi_data(fine_resolution=2, == countyID].copy() # remove potential rows of which codes are not in npi_codes_considered - npi_rows = [i in npi_codes_considered + npi_rows = [i in npis_considered[dd.EngEng['npiCode']].values for i in df_local_old[dd.EngEng['npiCode']]] # get list of NPI codes, ordered as the rows in the current data frame @@ -459,7 +498,7 @@ def transform_npi_data(fine_resolution=2, # may be superfluous if NPI code rows are sorted correctly npi_code_rows_to_sorted = [ npi_codes_ordered_as_rows.index(i) for i in - npi_codes_considered] + npis_considered[dd.EngEng['npiCode']].values] # access NPI values matrix and store it as integers npi_vals = 
df_local_old.iloc[npi_rows, start_npi_cols:].astype(int) @@ -467,14 +506,14 @@ def transform_npi_data(fine_resolution=2, # create columns for date, county ID and NPI code df_local_new = pd.DataFrame( columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - npi_codes_considered) + list(npis_considered[dd.EngEng['npiCode']])) # fill in NPI values by transposing from columns to rows df_local_new[dd.EngEng['date']] = dates_new df_local_new[dd.EngEng['idCounty']] = countyID # possible resorting of rows such that they are sorted according to # a literal sorting of the code strings - df_local_new[npi_codes_considered] = np.transpose( + df_local_new[npis_considered[dd.EngEng['npiCode']]] = np.transpose( npi_vals.iloc[npi_code_rows_to_sorted, :].values) counters[cid] += time.perf_counter()-start_time @@ -483,11 +522,11 @@ def transform_npi_data(fine_resolution=2, start_time = time.perf_counter() # replace -99 ("not used anymore") by 0 ("not used") - df_local_new[npi_codes_considered] = df_local_new[npi_codes_considered].replace( - -99, 0) + df_local_new[npis_considered[dd.EngEng['npiCode']] + ] = df_local_new[npis_considered[dd.EngEng['npiCode']]].replace(-99, 0) # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") - df_local_new[npi_codes_considered] = df_local_new[npi_codes_considered].replace([ - 2, 3, 4, 5], 1) + df_local_new[npis_considered[dd.EngEng['npiCode']] + ] = df_local_new[npis_considered[dd.EngEng['npiCode']]].replace([2, 3, 4, 5], 1) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -500,6 +539,8 @@ def transform_npi_data(fine_resolution=2, # infections to determine whether the NPI was active start_time = time.perf_counter() # TODO + for incidval, npi_indices in incidence_thresholds_to_npis.items(): + print(incidval) counters[cid] += time.perf_counter()-start_time cid += 1 ### ### @@ -742,14 +783,14 @@ def transform_npi_data(fine_resolution=2, if npi_codes_considered[i] in npi_codes_empty: npi_unused_indices.append(i) npi_unused_indices_all.append( - npi_codes.index(npi_codes_considered[i])) + npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) else: npi_used_indices.append(i) npi_used_indices_all.append( - npi_codes.index(npi_codes_considered[i])) + npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) - npis_unused = np.array(npi_desc)[npi_unused_indices_all] - npis_used = np.array(npi_desc)[npi_used_indices_all] + npis_unused = np.array(npis[dd.EngEng['desc']])[npi_unused_indices_all] + npis_used = np.array(npis[dd.EngEng['desc']])[npi_used_indices_all] npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices]) npi_codes_unused = list( np.array(npi_codes_considered)[npi_unused_indices]) From dc2e9b20643f66e1bb401036b4565fd72df30261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 7 Apr 2022 08:53:17 +0200 Subject: [PATCH 006/104] Compute incidence per county --- pycode/memilio-epidata/memilio/epidata/defaultDict.py | 3 ++- .../memilio/epidata/transformNPIData.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/defaultDict.py b/pycode/memilio-epidata/memilio/epidata/defaultDict.py index 8f1c9b6517..dd5c890547 100644 --- a/pycode/memilio-epidata/memilio/epidata/defaultDict.py +++ b/pycode/memilio-epidata/memilio/epidata/defaultDict.py @@ -104,7 +104,8 @@ 'total_volume': 'Unique_trips', 'region_name': 'County', 'region_id': 'ID_County', - 'desc': 'description' + 'desc': 'Description', + 'incidence': 'Incidence' } GerEng = { diff --git 
a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 464de4cb63..c8beb5d5a0 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -473,14 +473,21 @@ def transform_npi_data(fine_resolution=2, directory, 'all_county_all_dates_repdate_rki.json')) df_population = pd.read_json( directory + "county_current_population.json") - # compute incidence based on previous data frames - # TODO # iterate over countyIDs counters = np.zeros(4) # time counter for output only cid = 0 for countyID in [1001]: # unique_geo_entities: + # compute incidence based on previous data frames + df_infec_local = df_infec_rki[df_infec_rki[dd.EngEng['idCounty']] == countyID].copy( + ) + pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] + == countyID, dd.EngEng['population']].values[0] + incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( + periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) + df_infec_local['Incidence'] = incidence_local / pop_local * 100000 + # get county-local data frame start_time = time.perf_counter() df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] From 31e8acaf6573a88ae9734faec63fe3f714b86b6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 7 Apr 2022 17:05:26 +0200 Subject: [PATCH 007/104] Activation of NPIs implemented; without deactivation by stronger NPIs; without aggregation --- .../memilio/epidata/defaultDict.py | 2 +- .../memilio/epidata/transformNPIData.py | 68 ++++++++++++++++--- 2 files changed, 58 insertions(+), 12 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/defaultDict.py b/pycode/memilio-epidata/memilio/epidata/defaultDict.py index dd5c890547..4c514f6701 100644 --- a/pycode/memilio-epidata/memilio/epidata/defaultDict.py +++ b/pycode/memilio-epidata/memilio/epidata/defaultDict.py @@ -43,7 +43,7 @@ 'make_plot': False, 'out_folder': default_file_path, 'update_data': False, - 'start_date': date(2020, 4, 24), + 'start_date': date(2020, 1, 1), 'end_date': date.today(), 'split_berlin': False, 'impute_dates': False, diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index c8beb5d5a0..6957a55505 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -235,6 +235,10 @@ def transform_npi_data(fine_resolution=2, if not read_data: if fine_resolution > 0: + # defines delay in number of days between exceeding + # incidence threshold and NPI getting active + npi_activation_delay = 0 + try: df_npis_old = pd.read_csv( os.path.join( directory, 'kr_massnahmen_unterkategorien.csv'), @@ -466,27 +470,49 @@ def transform_npi_data(fine_resolution=2, str(dates_new[i] + timedelta(date_diff[i] - 1))) sys.exit('Exiting. 
Dates missing in data frame.') + min_date = [] + max_date = [] + # get RKI infectious numbers to find dates where incidence-dependent # NPIs were active if fine_resolution > 0: df_infec_rki = pd.read_json(os.path.join( directory, 'all_county_all_dates_repdate_rki.json')) + df_infec_rki[dd.EngEng['date']] = pd.to_datetime( + df_infec_rki[dd.EngEng['date']]) df_population = pd.read_json( directory + "county_current_population.json") + min_date.append( + df_infec_rki[dd.EngEng['date']].min().to_pydatetime()) + max_date.append( + df_infec_rki[dd.EngEng['date']].max().to_pydatetime()) + + # adapt time series according to available dates and start_date, + # end_date input parameter + start_date_new = max( + min_date + [min(dates_new), pd.to_datetime(start_date)]) + end_date_new = min( + max_date + [max(dates_new), + pd.to_datetime(end_date)]) # iterate over countyIDs counters = np.zeros(4) # time counter for output only cid = 0 for countyID in [1001]: # unique_geo_entities: - # compute incidence based on previous data frames - df_infec_local = df_infec_rki[df_infec_rki[dd.EngEng['idCounty']] == countyID].copy( - ) - pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] - == countyID, dd.EngEng['population']].values[0] - incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( - periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) - df_infec_local['Incidence'] = incidence_local / pop_local * 100000 + if fine_resolution > 0: + # compute incidence based on previous data frames + df_infec_local = df_infec_rki[df_infec_rki[dd.EngEng['idCounty']] == countyID].copy( + ) + pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] + == countyID, dd.EngEng['population']].values[0] + incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( + periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) + df_infec_local['Incidence'] = incidence_local / pop_local * 100000 + + # cut infection information at start_date_new and end_date_new + df_infec_local = df_infec_local.loc[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( + df_infec_local[dd.EngEng['date']] <= end_date_new), :].reset_index() # get county-local data frame start_time = time.perf_counter() @@ -545,9 +571,29 @@ def transform_npi_data(fine_resolution=2, # active and evaluation has to be conducted against confirmed # infections to determine whether the NPI was active start_time = time.perf_counter() - # TODO - for incidval, npi_indices in incidence_thresholds_to_npis.items(): - print(incidval) + if fine_resolution > 0: + # cut NPI information at start_date_new and end_date_new + df_local_new = df_local_new.loc[(df_local_new[dd.EngEng['date']] >= start_date_new) & ( + df_local_new[dd.EngEng['date']] <= end_date_new), :].reset_index() + # get index of first NPI column in local data frame + npis_idx_start = list( + df_local_new.columns).index( + npis_considered[dd.EngEng['npiCode']][0]) + + # iterate through all NPIs and activate if incidence threshold + # is exceeded + for incidval, npi_indices in incidence_thresholds_to_npis.items(): + if incidval >= 0: + int_active = ( + df_infec_local['Incidence'] >= incidval).astype(int) + # multiply rows of data frame by either 1 if threshold + # passed (i.e., mentioned NPI is active) or zero + # (i.e., mentioned NPI is not active) + # 'mul' multiplies the original data frame row by row + # with the respective value in int_active + df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ + = df_local_new.iloc[:, npis_idx_start + 
np.array(npi_indices)].mul(int_active, axis=0) + counters[cid] += time.perf_counter()-start_time cid += 1 ### ### From f31ba8aab962df2c468dda10ef73a2f7ed45e45f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 8 Apr 2022 12:59:50 +0200 Subject: [PATCH 008/104] Compute incidence value, aggregate NPIs, use delay possible --- .../memilio/epidata/transformNPIData.py | 326 ++++++++++-------- 1 file changed, 184 insertions(+), 142 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 6957a55505..edcb584317 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -243,7 +243,7 @@ def transform_npi_data(fine_resolution=2, df_npis_old = pd.read_csv( os.path.join( directory, 'kr_massnahmen_unterkategorien.csv'), - sep=',', nrows=1248) # 1248 for debugging, only reading Flensburg + sep=',') # , nrows=1248) # 1248 for debugging, only reading Flensburg except FileNotFoundError: print_manual_download( 'kr_massnahmen_unterkategorien.csv', @@ -258,7 +258,7 @@ def transform_npi_data(fine_resolution=2, 'M24_030', 'M24_040', 'M24_050', 'M24_060'] for tcode in test_codes: for i in [''] + ["_" + str(i) for i in range(1, 6)]: - if(df_npis_old[df_npis_old.NPI_code == tcode+i].iloc[:, 6:].max().max() > 0): + if(df_npis_old[df_npis_old[dd.EngEng['npiCode']] == tcode+i].iloc[:, 6:].max().max() > 0): print(tcode+i + " used.") # end check @@ -389,35 +389,63 @@ def transform_npi_data(fine_resolution=2, npis_dummy = {dd.EngEng['npiCode']: npi_codes, dd.EngEng['desc']: npi_desc} npis = pd.DataFrame(npis_dummy) - # extract incidence-threshold for NPIs - if fine_resolution > 0: - npi_incid_start = dict() - for i in range(len(npis)): - incid_threshold = 1e10 - if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': - # set -1 for incidence-independent NPIs - incid_threshold = -1 - elif npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': - incid_threshold = int( - npis.loc[i, dd.EngEng['desc']].split(' ')[1]) - else: - sys.exit('Error in description file. NPI activation can not ' - 'be computed. Exiting.') - npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] - ] = incid_threshold - - # get all incidence thresholds - incidence_thresholds = sorted(set(npi_incid_start.values())) - # transform data from original format to desired format if not read_data: - # could be used to reduced NPIs to be considered - # NEITHER used NOR tested with subset of NPIs so far. - npis_considered = npis.copy() + # prepare grouping of NPIs to reduce product space of + # NPI x active_from_inc (with values "incidence does not matter", and + # incidence 0, 10, 35, 50, 100) to NPI + if fine_resolution == 1: + # create hash table from parental or main code/main category + # to list of subcodes/subcategories + maincode_to_npicodes_map = dict() + npicodes_to_maincode_map = dict() + major_code = npi_codes[0] + maincode_to_npicodes_map[major_code] = [] + for code in npi_codes: + npicodes_to_maincode_map[code] = major_code + if major_code in code: + maincode_to_npicodes_map[major_code].append(code) + else: + major_code = code + maincode_to_npicodes_map[major_code] = [code] + + npi_codes_aggregated = [] + for main_code in maincode_to_npicodes_map.keys(): + if main_code.count('_') > 1: + sys.exit('Error. 
Subcode assigned as main code.') + npi_codes_aggregated.append(main_code) + + npis_final = npis[npis[dd.EngEng['npiCode']].isin( + npi_codes_aggregated)].reset_index() + else: + npis_final = npis + + # extract incidence-threshold for NPIs + if fine_resolution > 0: + npi_incid_start = dict() + for i in range(len(npis)): + incid_threshold = 1e10 + if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': + # set -1 for incidence-independent NPIs + incid_threshold = -1 + elif npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': + incid_threshold = int( + npis.loc[i, dd.EngEng['desc']].split(' ')[1]) + else: + sys.exit( + 'Error in description file. NPI activation can not ' + 'be computed. Exiting.') + npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] + ] = incid_threshold + + # get all incidence thresholds + incidence_thresholds = sorted(set(npi_incid_start.values())) + + # create hash map from thresholds to NPI indices incidence_thresholds_to_npis = dict( zip(incidence_thresholds, [[] for i in range(len(incidence_thresholds))])) - for i in range(len(npis_considered)): - incval = npi_incid_start[npis_considered.loc + for i in range(len(npis)): + incval = npi_incid_start[npis.loc [i, dd.EngEng['npiCode']]] incidence_thresholds_to_npis[incval].append(i) @@ -442,14 +470,14 @@ def transform_npi_data(fine_resolution=2, # resolved by county and day df_npis = pd.DataFrame( columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis_considered[dd.EngEng['npiCode']])) + list(npis_final[dd.EngEng['npiCode']])) # convert NPI data from object to int such that correlations can be # computed df_npis = df_npis.astype(dict( zip( [dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis_considered[dd.EngEng['npiCode']]), ['str', 'int'] + - ['int' for i in npis_considered[dd.EngEng['npiCode']]]))) + list(npis_final[dd.EngEng['npiCode']]), ['str', 'int'] + + ['int' for i in npis_final[dd.EngEng['npiCode']]]))) # store string dates 'dYYYYMMDD' in list before parsing str_dates = list(df_npis_old.iloc[:, start_npi_cols:].columns) @@ -497,8 +525,10 @@ def transform_npi_data(fine_resolution=2, # iterate over countyIDs counters = np.zeros(4) # time counter for output only - cid = 0 - for countyID in [1001]: # unique_geo_entities: + countyidx = 0 + for countyID in unique_geo_entities: + cid = 0 + countyidx += 1 if fine_resolution > 0: # compute incidence based on previous data frames @@ -520,7 +550,7 @@ def transform_npi_data(fine_resolution=2, == countyID].copy() # remove potential rows of which codes are not in npi_codes_considered - npi_rows = [i in npis_considered[dd.EngEng['npiCode']].values + npi_rows = [i in npis[dd.EngEng['npiCode']].values for i in df_local_old[dd.EngEng['npiCode']]] # get list of NPI codes, ordered as the rows in the current data frame @@ -531,7 +561,7 @@ def transform_npi_data(fine_resolution=2, # may be superfluous if NPI code rows are sorted correctly npi_code_rows_to_sorted = [ npi_codes_ordered_as_rows.index(i) for i in - npis_considered[dd.EngEng['npiCode']].values] + npis[dd.EngEng['npiCode']].values] # access NPI values matrix and store it as integers npi_vals = df_local_old.iloc[npi_rows, start_npi_cols:].astype(int) @@ -539,14 +569,14 @@ def transform_npi_data(fine_resolution=2, # create columns for date, county ID and NPI code df_local_new = pd.DataFrame( columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis_considered[dd.EngEng['npiCode']])) + list(npis[dd.EngEng['npiCode']])) # fill in NPI values by transposing from columns to rows 
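# (df_local_old stores one row per NPI code and one column per date; df_local_new needs one row per date and one column per NPI code, hence the transpose below)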
df_local_new[dd.EngEng['date']] = dates_new df_local_new[dd.EngEng['idCounty']] = countyID # possible resorting of rows such that they are sorted according to # a literal sorting of the code strings - df_local_new[npis_considered[dd.EngEng['npiCode']]] = np.transpose( + df_local_new[npis[dd.EngEng['npiCode']]] = np.transpose( npi_vals.iloc[npi_code_rows_to_sorted, :].values) counters[cid] += time.perf_counter()-start_time @@ -555,11 +585,11 @@ def transform_npi_data(fine_resolution=2, start_time = time.perf_counter() # replace -99 ("not used anymore") by 0 ("not used") - df_local_new[npis_considered[dd.EngEng['npiCode']] - ] = df_local_new[npis_considered[dd.EngEng['npiCode']]].replace(-99, 0) + df_local_new[npis[dd.EngEng['npiCode']] + ] = df_local_new[npis[dd.EngEng['npiCode']]].replace(-99, 0) # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") - df_local_new[npis_considered[dd.EngEng['npiCode']] - ] = df_local_new[npis_considered[dd.EngEng['npiCode']]].replace([2, 3, 4, 5], 1) + df_local_new[npis[dd.EngEng['npiCode']] + ] = df_local_new[npis[dd.EngEng['npiCode']]].replace([2, 3, 4, 5], 1) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -578,14 +608,22 @@ def transform_npi_data(fine_resolution=2, # get index of first NPI column in local data frame npis_idx_start = list( df_local_new.columns).index( - npis_considered[dd.EngEng['npiCode']][0]) + npis[dd.EngEng['npiCode']][0]) # iterate through all NPIs and activate if incidence threshold # is exceeded - for incidval, npi_indices in incidence_thresholds_to_npis.items(): - if incidval >= 0: + for incidvalthrsh, npi_indices in incidence_thresholds_to_npis.items(): + if incidvalthrsh >= 0: + local_incid = df_infec_local['Incidence'].copy() + if npi_activation_delay > 0: + # shift values to npi_activation_delay days later + local_incid.iloc[npi_activation_delay: + ] = local_incid.iloc[0:-npi_activation_delay].values + # take constant value of day 0 for first delay days + local_incid.iloc[:npi_activation_delay] = local_incid.iloc[0] + # compare incidence against threshold int_active = ( - df_infec_local['Incidence'] >= incidval).astype(int) + local_incid >= incidvalthrsh).astype(int) # multiply rows of data frame by either 1 if threshold # passed (i.e., mentioned NPI is active) or zero # (i.e., mentioned NPI is not active) @@ -594,6 +632,18 @@ def transform_npi_data(fine_resolution=2, df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) + # reduction of factor space NPI x incidence threshold to NPI + # by max aggregation of all incidence threshold columns per NPI + if fine_resolution == 1: + for main_code, codes_group in maincode_to_npicodes_map.items(): + # group by incidence (former codes X1_Y, X1_Z were transformed + # to X1, X2) and write max value to main code column + df_local_new.loc[:, main_code] = df_local_new.loc[:, codes_group].max( + axis=1) + # remove subcategory columns + df_local_new = df_local_new.loc[:, [ + dd.EngEng['date'], dd.EngEng['idCounty']] + npi_codes_aggregated].copy() + counters[cid] += time.perf_counter()-start_time cid += 1 ### ### @@ -603,16 +653,6 @@ def transform_npi_data(fine_resolution=2, counters[cid] += time.perf_counter()-start_time cid += 1 - # TODO: aggregation now here - if fine_resolution == 1: - # group by incidence (former codes X1_Y, X1_Z were transformed - # to X1, X2) and take max value - df_local_old = df_local_old.groupby(dd.EngEng['npiCode']).max() - # insert aggregated NPI code 
column - df_local_old.insert( - loc=start_npi_cols - 1, column=dd.EngEng['npiCode'], - value=df_local_old.index) - # kita # figcode = ["M03_0" + str(i)+str(j) for i in range(10,70,10) for j in [''] + ['_'+str(k) for k in range(1,6)]] # # school @@ -622,36 +662,19 @@ def transform_npi_data(fine_resolution=2, # print('') # customPlot.plotList(df_npis.loc[:,"Date"], [df_npis.loc[:,bb] for bb in figcode], legend=figcode, title='asd', xlabel='asd', ylabel='ad', fig_name='asd') - print(counters) - - # TODO: aggregation now here - # group incidence NPIs to remove product space of - # NPI x active_from_inc (with values "incidence does not matter", and - # incidence 0, 10, 35, 50, 100) - if fine_resolution == 1: - # create hash table from subcode/subcategory to parental or main code/main category - npi_codes_to_maincode_map = dict() - major_code = npi_codes[0] - for code in npi_codes: - if major_code in code: - npi_codes_to_maincode_map[code] = major_code - else: - major_code = code - npi_codes_to_maincode_map[code] = code - - # get unique list of main codes - npi_codes_incgrouped_list = sorted( - set(npi_codes_to_maincode_map.values())) + # divide working time by completed number of counties and multiply + # by remaining number of counties to estimate time remaining + time_remain = sum( + counters) / countyidx * (len(unique_geo_entities) - countyidx) + # print progress + print( + 'Progress ' + str(countyidx) + ' / ' + + str(len(unique_geo_entities)) + '. Estimated time remaining: ' + + str(int(time_remain / 60)) + ' min.') - if not read_data: - # replace more detailed code names X_Y with major code X - df_npis_old[dd.EngEng['npiCode']] = df_npis_old[dd.EngEng[ - 'npiCode']].replace(npi_codes_to_maincode_map) - - if fine_resolution == 1: - npi_codes_considered = npi_codes_incgrouped_list - else: - npi_codes_considered = npi_codes + # print sub counters + print('Sub task counters are: ') + print(counters) # reset index and drop old index column df_npis.reset_index(inplace=True) @@ -669,123 +692,142 @@ def transform_npi_data(fine_resolution=2, str(counters[2]) + " sec") #### start validation #### - if fine_resolution > 1: + if fine_resolution == 2: # Cologne for M01a_010 and all dates (no changes) - dummy_old = df_npis_old[(df_npis_old.ID_County == 5315) & ( - df_npis_old.NPI_code == 'M01a_010')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 5315, 'M01a_010'].values print(abs(dummy_old-dummy_new).sum() == 0) # Flensburg for M05_120 and all dates ('2's become '1's) - dummy_old = df_npis_old[(df_npis_old.ID_County == 1001) & ( - df_npis_old.NPI_code == 'M05_120')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1001) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M05_120')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 1001, 'M05_120'].values print(abs(dummy_old-dummy_new).sum() == 5) # Munich for M01a_010_4 and all dates (-99 becomes 0, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old.ID_County == 9162) & ( - df_npis_old.NPI_code == 'M01a_010_4')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & ( + df_npis_old[dd.EngEng['npiCode']] == 
'M01a_010_4')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 9162, 'M01a_010_4'].values print(abs(dummy_old-dummy_new).sum() == 422*99) # Weimar for M12_030_3 and all dates (-99 becomes 0, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old.ID_County == 16071) & ( - df_npis_old.NPI_code == 'M12_030_3')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 16071) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M12_030_3')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 16071, 'M12_030_3'].values print(abs(dummy_old-dummy_new).sum() == 422*99) # Berlin for M01b_020 and all dates (2 becomes 1, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old.ID_County == 11000) & ( - df_npis_old.NPI_code == 'M01b_020')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 11000, 'M01b_020'].values print(abs(dummy_old-dummy_new).sum() == 82) # Segeberg for M02b_035 and all dates (2 -> 1, 3 -> 1, 5 -> 1) - dummy_old = df_npis_old[(df_npis_old.ID_County == 1060) & ( - df_npis_old.NPI_code == 'M02b_035')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 1060, 'M02b_035'].values print(abs(dummy_old-dummy_new).sum() == 151+2*53+4*22) # Steinfurt for M16_050 and all dates (4 -> 1, ...) 
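# (the expected difference 32 + 2*20 + 3*22 below presumably counts 32 days with value 2, 20 days with value 3 and 22 days with value 4, each capped to 1 by the transformation)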
- dummy_old = df_npis_old[(df_npis_old.ID_County == 5566) & ( - df_npis_old.NPI_code == 'M16_050')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5566) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M16_050')].values[0][start_npi_cols:] + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 5566, 'M16_050'].values print(abs(dummy_old-dummy_new).sum() == 32+2*20+3*22) - elif fine_resolution == 2: + elif fine_resolution == 1: + # replace more detailed code names X_Y with major code X + df_npis_old[dd.EngEng['npiCode']] = df_npis_old[dd.EngEng[ + 'npiCode']].replace(npicodes_to_maincode_map) + # Cologne for M01a_010 and all dates (no changes) - dummy_old = df_npis_old[(df_npis_old.ID_County == 5315) & ( - df_npis_old.NPI_code == 'M01a_010')].values[0][start_npi_cols:] + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[0][start_npi_cols:] for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 5315) & ( - df_npis_old.NPI_code == 'M01a_010')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[subcode][start_npi_cols:]) + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 5315, 'M01a_010'].values print(abs(dummy_old-dummy_new).sum() == 0) # Flensburg for M05_120 and all dates ('2's become '1's) - dummy_old = df_npis_old[(df_npis_old.ID_County == 1001) & ( - df_npis_old.NPI_code == 'M05_120')].values[0][start_npi_cols:] + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1001) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M05_120')].values[0][start_npi_cols:] for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 1001) & ( - df_npis_old.NPI_code == 'M05_120')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = np.maximum(dummy_old, + df_npis_old + [(df_npis_old + [dd.EngEng['idCounty']] == 1001) & + (df_npis_old[dd.EngEng['npiCode']] == + 'M05_120')].values[subcode] + [start_npi_cols:]) + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 1001, 'M05_120'].values print(abs(dummy_old-dummy_new).sum() == 5) # Munich for M01a_010 and all dates (-99 becomes 0, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old.ID_County == 9162) & ( - df_npis_old.NPI_code == 'M01a_010')].values[0][start_npi_cols:] + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[0][start_npi_cols:] for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 9162) & ( - df_npis_old.NPI_code == 'M01a_010')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[subcode][start_npi_cols:]) + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 9162, 'M01a_010'].values print(abs(dummy_old-dummy_new).sum() == 0) # Weimar for M12_030_3 and all dates (-99 becomes 0, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old.ID_County == 16071) & ( - 
df_npis_old.NPI_code == 'M12_030')].values[0][start_npi_cols:] + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 16071) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M12_030')].values[0][start_npi_cols:] for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 16071) & ( - df_npis_old.NPI_code == 'M12_030')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = np.maximum(dummy_old, + df_npis_old + [(df_npis_old + [dd.EngEng['idCounty']] == 16071) & + (df_npis_old[dd.EngEng['npiCode']] == + 'M12_030')].values[subcode] + [start_npi_cols:]) + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 16071, 'M12_030'].values print(abs(dummy_old-dummy_new).sum() == 19) # Berlin for M01b_020 and all dates (2 becomes 1, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old.ID_County == 11000) & ( - df_npis_old.NPI_code == 'M01b_020')].values[0][start_npi_cols:] + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].values[0][start_npi_cols:] for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 11000) & ( - df_npis_old.NPI_code == 'M01b_020')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].values[subcode][start_npi_cols:]) + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 11000, 'M01b_020'].values print(abs(dummy_old-dummy_new).sum() == 82) # Segeberg for M02b_035 and all dates (2 -> 1, 3 -> 1, 5 -> 1) - dummy_old = df_npis_old[(df_npis_old.ID_County == 1060) & ( - df_npis_old.NPI_code == 'M02b_035')].values[0][start_npi_cols:] + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].values[0][start_npi_cols:] for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 1060) & ( - df_npis_old.NPI_code == 'M02b_035')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].values[subcode][start_npi_cols:]) + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 1060, 'M02b_035'].values print(abs(dummy_old-dummy_new).sum() == 151+2*53+4*22) # Steinfurt for M16_050 and all dates (4 -> 1, ...) 
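# (for fine_resolution == 1, the reference value is the elementwise maximum over the main code row and its five subcode rows, accumulated in the subcode loop below)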
- dummy_old = df_npis_old[(df_npis_old.ID_County == 5566) & ( - df_npis_old.NPI_code == 'M16_050')].values[0][start_npi_cols:] + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5566) & ( + df_npis_old[dd.EngEng['npiCode']] == 'M16_050')].values[0][start_npi_cols:] for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old.ID_County == 5566) & ( - df_npis_old.NPI_code == 'M16_050')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis.ID_County == + dummy_old = np.maximum(dummy_old, + df_npis_old + [(df_npis_old + [dd.EngEng['idCounty']] == 5566) & + (df_npis_old[dd.EngEng['npiCode']] == + 'M16_050')].values[subcode] + [start_npi_cols:]) + dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == 5566, 'M16_050'].values print(abs(dummy_old-dummy_new).sum() == 32+2*20+3*22) #### end validation #### @@ -1073,7 +1115,7 @@ def main(): """! Main program entry.""" # arg_dict = gd.cli("testing") - transform_npi_data(fine_resolution=2, read_data=False, make_plot=True) + transform_npi_data(fine_resolution=1, read_data=False, make_plot=True) if __name__ == "__main__": From c25384f305c9c3c5ae2e6b489d05a81b32437d66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 22 Apr 2022 19:54:06 +0200 Subject: [PATCH 009/104] further validation and minor items --- .../memilio/epidata/transformNPIData.py | 314 ++++++++++++++---- 1 file changed, 243 insertions(+), 71 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index edcb584317..ce78f8cf28 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
############################################################################# -from datetime import datetime, timedelta +from datetime import datetime, timedelta, date import sys import time import os @@ -238,6 +238,8 @@ def transform_npi_data(fine_resolution=2, # defines delay in number of days between exceeding # incidence threshold and NPI getting active npi_activation_delay = 0 + print('Using a delay of NPI activation of ' + + str(npi_activation_delay) + ' days.') try: df_npis_old = pd.read_csv( @@ -466,19 +468,6 @@ def transform_npi_data(fine_resolution=2, df_npis_old.columns).index( dd.EngEng['npiCode']) + 1 - # create new data frame for all NPIs given in the columns, - # resolved by county and day - df_npis = pd.DataFrame( - columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis_final[dd.EngEng['npiCode']])) - # convert NPI data from object to int such that correlations can be - # computed - df_npis = df_npis.astype(dict( - zip( - [dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis_final[dd.EngEng['npiCode']]), ['str', 'int'] + - ['int' for i in npis_final[dd.EngEng['npiCode']]]))) - # store string dates 'dYYYYMMDD' in list before parsing str_dates = list(df_npis_old.iloc[:, start_npi_cols:].columns) # convert string dates into other format @@ -523,10 +512,24 @@ def transform_npi_data(fine_resolution=2, max_date + [max(dates_new), pd.to_datetime(end_date)]) + # create new data frame for all NPIs given in the columns, + # resolved by county and day + df_npis = pd.DataFrame( + columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + + list(npis_final[dd.EngEng['npiCode']])) + # convert NPI data from object to int such that correlations can be + # computed + df_npis = df_npis.astype(dict( + zip( + [dd.EngEng['date']] + [dd.EngEng['idCounty']] + + list(npis_final[dd.EngEng['npiCode']]), ['str', 'int'] + + ['int' for i in npis_final[dd.EngEng['npiCode']]]))) + # iterate over countyIDs - counters = np.zeros(4) # time counter for output only + counters = np.zeros(5) # time counter for output only countyidx = 0 - for countyID in unique_geo_entities: + # unique_geo_entities: + for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: cid = 0 countyidx += 1 @@ -540,6 +543,10 @@ def transform_npi_data(fine_resolution=2, periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) df_infec_local['Incidence'] = incidence_local / pop_local * 100000 + # set to main data frame + df_infec_rki.loc[df_infec_rki[dd.EngEng['idCounty']] == + countyID, 'Incidence'] = df_infec_local['Incidence'].values + # cut infection information at start_date_new and end_date_new df_infec_local = df_infec_local.loc[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( df_infec_local[dd.EngEng['date']] <= end_date_new), :].reset_index() @@ -571,6 +578,11 @@ def transform_npi_data(fine_resolution=2, columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + list(npis[dd.EngEng['npiCode']])) + counters[cid] += time.perf_counter()-start_time + cid += 1 + + start_time = time.perf_counter() + # fill in NPI values by transposing from columns to rows df_local_new[dd.EngEng['date']] = dates_new df_local_new[dd.EngEng['idCounty']] = countyID @@ -605,6 +617,10 @@ def transform_npi_data(fine_resolution=2, # cut NPI information at start_date_new and end_date_new df_local_new = df_local_new.loc[(df_local_new[dd.EngEng['date']] >= start_date_new) & ( df_local_new[dd.EngEng['date']] <= end_date_new), :].reset_index() + try: + df_local_new = df_local_new.drop(columns='index') + except: + pass # get index of first 
NPI column in local data frame npis_idx_start = list( df_local_new.columns).index( npis[dd.EngEng['npiCode']][0]) @@ -649,28 +665,22 @@ def transform_npi_data(fine_resolution=2, ### ### start_time = time.perf_counter() - df_npis = df_npis.append(df_local_new.copy()) + df_npis = df_npis.append(df_local_new, + ignore_index=True).copy() counters[cid] += time.perf_counter()-start_time cid += 1 - # kita - # figcode = ["M03_0" + str(i)+str(j) for i in range(10,70,10) for j in [''] + ['_'+str(k) for k in range(1,6)]] - # # school - # figcode = ["M02a_0" + str(i)+str(j) for i in [10,20,30,31,32,33,34,35,36] for j in [''] + ['_'+str(k) for k in range(1,6)]] - # for bb in figcode: - # dates_mentioned = np.array(dates_new)[list(np.where(df_npis.loc[:,bb]>0)[0])] - # print('') - # customPlot.plotList(df_npis.loc[:,"Date"], [df_npis.loc[:,bb] for bb in figcode], legend=figcode, title='asd', xlabel='asd', ylabel='ad', fig_name='asd') # divide working time by completed number of counties and multiply # by remaining number of counties to estimate time remaining time_remain = sum( counters) / countyidx * (len(unique_geo_entities) - countyidx) # print progress - print( - 'Progress ' + str(countyidx) + ' / ' + - str(len(unique_geo_entities)) + '. Estimated time remaining: ' + - str(int(time_remain / 60)) + ' min.') + if countyidx == 1 or countyidx % int( + len(unique_geo_entities) / 10) == 0: + print('Progress ' + str(countyidx) + ' / ' + + str(len(unique_geo_entities)) + + '. Estimated time remaining: ' + + str(int(time_remain / 60)) + ' min.') # print sub counters print('Sub task counters are: ') print(counters) @@ -687,60 +697,222 @@ def transform_npi_data(fine_resolution=2, except: pass - print( - "Time needed: " + str(counters[0]) + ", " + str(counters[1]) + ", " + - str(counters[2]) + " sec") #### start validation #### if fine_resolution == 2: + # the following validation is only valid for end_date February 15, 2022 + # (cf. end_date_validation below) + # and data frames of mentioned NPIs, not active NPIs + start_date_validation = datetime(2020, 3, 1) + end_date_validation = datetime(2022, 2, 15) + end_date_validation_str = 'd20220215' + + # Cologne for M01a_010 and all dates (no changes, 0 to 0) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & (df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 5315) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M01a_010'].values + valdiff = 0 + for i in range(2, 6): + valdiff += (i-1)*len(np.where(dummy_old == i)[0]) + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) + + # Cologne for M01a_150_2 and all dates (no changes, 0 to 0, 1 to 1) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & (df_npis_old[dd.EngEng['npiCode']] == 'M01a_150_2')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 5315) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M01a_150_2'].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 5315) & + 
(df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= 10)[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(2, 6): + # these values >=1 are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) # Flensburg for M05_120 and all dates ('2's become '1's) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1001) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M05_120')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 1001, 'M05_120'].values - print(abs(dummy_old-dummy_new).sum() == 5) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1001) & (df_npis_old[dd.EngEng['npiCode']] == 'M05_120')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 1001) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M05_120'].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 1001) & + (df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= 10)[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(2, 6): + # these values >=1 are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) # Munich for M01a_010_4 and all dates (-99 becomes 0, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01a_010_4')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 9162, 'M01a_010_4'].values - print(abs(dummy_old-dummy_new).sum() == 422*99) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & (df_npis_old[dd.EngEng['npiCode']] == 'M01a_010_4')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 9162) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M01a_010_4'].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 9162) & + (df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= 10)[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(2, 6): + # these values >=1 
are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) - # Weimar for M12_030_3 and all dates (-99 becomes 0, 1 stays 1, 0 stays 0) + # Weimar for M18_030_4 and all dates (incidence-dependent, threshold 50) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 16071) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M12_030_3')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 16071, 'M12_030_3'].values - print(abs(dummy_old-dummy_new).sum() == 422*99) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 16071) & (df_npis_old[dd.EngEng['npiCode']] == 'M18_030_4')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 16071) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M18_030_4'].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 16071) & + (df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= 50)[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(2, 6): + # these values >=1 are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) # TODO # Berlin for M01b_020 and all dates (2 becomes 1, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 11000, 'M01b_020'].values - print(abs(dummy_old-dummy_new).sum() == 82) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & (df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 11000) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M01b_020'].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 11000) & + (df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= 10)[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(2, 6): + # these values >=1 are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) # TODO # Segeberg for M02b_035 and all dates (2 -> 1, 3 -> 1, 5 -> 1) - 
dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 1060, 'M02b_035'].values - print(abs(dummy_old-dummy_new).sum() == 151+2*53+4*22) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & (df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 1060) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M02b_035'].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 1060) & + (df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= 10)[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(2, 6): + # these values >=1 are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) # TODO # Steinfurt for M16_050 and all dates (4 -> 1, ...) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5566) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M16_050')].values[0][start_npi_cols:] - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 5566, 'M16_050'].values - print(abs(dummy_old-dummy_new).sum() == 32+2*20+3*22) + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5566) & (df_npis_old[dd.EngEng['npiCode']] == 'M16_050')].iloc[:, :list( + df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 5566) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), 'M16_050'].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 5566) & + (df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= 10)[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(2, 6): + # these values >=1 are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + print(abs(dummy_old-dummy_new).sum() == valdiff) + + x = 15 elif fine_resolution == 1: # replace more detailed code names X_Y with major code X df_npis_old[dd.EngEng['npiCode']] = df_npis_old[dd.EngEng[ @@ -1115,7 +1287,7 @@ def main(): """! 
Main program entry.""" # arg_dict = gd.cli("testing") - transform_npi_data(fine_resolution=1, read_data=False, make_plot=True) + transform_npi_data(fine_resolution=2, read_data=False, make_plot=True) if __name__ == "__main__": From a6362212b93f1c23a0faf5fecaa4c4f6d0ac042e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Tue, 7 Jun 2022 16:27:30 +0200 Subject: [PATCH 010/104] outsourcing of validation to function --- .../memilio/epidata/transformNPIData.py | 289 ++++-------------- 1 file changed, 62 insertions(+), 227 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index ce78f8cf28..2904e96814 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -36,13 +36,13 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): - """! Computes a score for a particular clustering based on the - correlation matrix. The score is computed as the percentage of 'higher' - or 'high' values (e.g., between 0.5 and 0.75 or 0.75 and 1) of the - correlation matrix that are to be found in the diagonal blocks of the + """! Computes a score for a particular clustering based on the + correlation matrix. The score is computed as the percentage of 'higher' + or 'high' values (e.g., between 0.5 and 0.75 or 0.75 and 1) of the + correlation matrix that are to be found in the diagonal blocks of the clustered correlation matrix vs these values in the offdiagonal blocks. - @param corr_mat correlation matrix between the features / data set items + @param corr_mat correlation matrix between the features / data set items that were clustered. @param idx_to_cluster_idx Mapping of data item to cluster index. @param indices_all List of indices of all data items. @@ -98,15 +98,15 @@ def compute_hierarch_clustering(corr_mat, corr_pairwdist, metrics=['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']): - """! Computes a hierarchical clustering for a (list of) metric(s) and - provides the maximum cophenetic distance(s) as well as a score for the + """! Computes a hierarchical clustering for a (list of) metric(s) and + provides the maximum cophenetic distance(s) as well as a score for the clustering (see @method evaluate_clustering(...)). - @param corr_mat correlation matrix between the features / data set items + @param corr_mat correlation matrix between the features / data set items to be clustered hierarchically. @param corr_pairwdist Computed pairwise distance between the features / data set items. - @param metric Metric or list of metrics to compute the hierarchical + @param metric Metric or list of metrics to compute the hierarchical clustering. @return (List of) hierarchical clustering(s), maximum cophenetic distance(s) @@ -144,15 +144,15 @@ def compute_hierarch_clustering(corr_mat, corr_pairwdist, def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): - """! Flattens a hierarchical clustering for a (list of) maximum cophenetic + """! Flattens a hierarchical clustering for a (list of) maximum cophenetic distance(s) in the flat clusters and evaluates the resulting clustering with respect to the corresponding correlation matrix. - @param corr_mat correlation matrix between the features / data set items + @param corr_mat correlation matrix between the features / data set items clustered hierarchically. 
- @param cluster_hierarch hierarchical clustering of given features / data + @param cluster_hierarch hierarchical clustering of given features / data set items. - @param weigths Maximum cophenetic distance or list of maximum cophenetic + @param weights Maximum cophenetic distance or list of maximum cophenetic distances to compute the flat clustering(s). @return flat clustering(s) according to the (list of) maximum distance(s). @@ -179,6 +179,37 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): return npi_idx_to_cluster_idx_list +def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, + start_npi_cols, npi_incid_start, start_date_validation, + end_date_validation): + # compares the transformed, incidence-activated NPI data (df_npis) with + # the raw NPI data (df_npis_old) for one county and NPI code; returns + # the observed and the expected absolute deviation plus both data sets + dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == countyID) & (df_npis_old[dd.EngEng['npiCode']] == npiCode) + ].iloc[:, :list(df_npis_old.columns).index(end_date_validation.strftime('d%Y%m%d'))+1].values[0][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == countyID) & ( + df_npis[dd.EngEng['date']] <= end_date_validation), npiCode].values + incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == countyID) & + (df_infec_rki[dd.EngEng['date']] <= + end_date_validation) & + (df_infec_rki[dd.EngEng['date']] >= + start_date_validation)]['Incidence'].values + npi_index = np.where(dummy_old >= 1)[0] + incid_index = np.where(incid >= npi_incid_start[npiCode])[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + valdiff = 0 + for i in range(1, 6): + # these values >=1 are set to 1 + valdiff += (i-1)*len( + np.where(dummy_old[list(active_index)] == i)[0]) + # these values >=1 are set to 0 + valdiff += i * len( + np.where(dummy_old[list(nonactive_index)] == i)[0]) + # -99 always set to 0 + valdiff += 99*len(np.where(dummy_old == -99)[0]) + return [abs(dummy_old-dummy_new).sum(), valdiff, dummy_old, dummy_new] + + def print_manual_download(filename, url): print( @@ -196,8 +227,8 @@ def transform_npi_data(fine_resolution=2, make_plot=dd.defaultDict['make_plot'], ): """! Loads a certain resolution of recorded NPI data from - the Corona Datenplattform and transforms it according to the - arguments given. + the Corona Datenplattform and transforms it according to the + arguments given. For full functionality, please manually download - kr_massnahmen_unterkategorien.csv @@ -209,14 +240,14 @@ def transform_npi_data(fine_resolution=2, and move it to the *directory*-path mentioned in the beginning of the function. @param fine_resolution 2 [Default] or 0 or 1. Defines which categories - are considered. - If '2' is set, all the subcategories (~1200) are considered. - If '1' is set, all incidence levels of subcategories are merged and + are considered. + If '2' is set, all the subcategories (~1200) are considered. + If '1' is set, all incidence levels of subcategories are merged and ~200 NPIs are considered. If '0' is chosen only the main, summarizing categories (~20) are used. - @param file_format File format which is used for writing the data. + @param file_format File format which is used for writing the data. Default defined in defaultDict. - @param out_folder Path to folder where data is written in folder + @param out_folder Path to folder where data is written in folder out_folder/Germany. @param start_date [Default = '', taken from read data] Start date of stored data frames. 
@@ -703,214 +734,18 @@ def transform_npi_data(fine_resolution=2, # and data frames of mentioned NPIs, not active NPIs start_date_validation = datetime(2020, 3, 1) end_date_validation = datetime(2022, 2, 15) - end_date_validation_str = 'd20220215' - - # Cologne for M01a_010 and all dates (no changes, 0 to 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & (df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 5315) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M01a_010'].values - valdiff = 0 - for i in range(2, 6): - valdiff += (i-1)*len(np.where(dummy_old == i)[0]) - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) - - # Cologne for M01a_150_2 and all dates (no changes, 0 to 0, 1 to 1) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & (df_npis_old[dd.EngEng['npiCode']] == 'M01a_150_2')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 5315) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M01a_150_2'].values - incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 5315) & - (df_infec_rki[dd.EngEng['date']] <= - end_date_validation) & - (df_infec_rki[dd.EngEng['date']] >= - start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= 10)[0] - active_index = np.sort( - list(set(npi_index).intersection(incid_index))) - nonactive_index = np.sort( - list(set(npi_index).difference(active_index))) - valdiff = 0 - for i in range(2, 6): - # these values >=1 are set to 1 - valdiff += (i-1)*len( - np.where(dummy_old[list(active_index)] == i)[0]) - # these values >=1 are set to 0 - valdiff += i * len( - np.where(dummy_old[list(nonactive_index)] == i)[0]) - # -99 always set to 0 - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) - - # Flensburg for M05_120 and all dates ('2's become '1's) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1001) & (df_npis_old[dd.EngEng['npiCode']] == 'M05_120')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 1001) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M05_120'].values - incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 1001) & - (df_infec_rki[dd.EngEng['date']] <= - end_date_validation) & - (df_infec_rki[dd.EngEng['date']] >= - start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= 10)[0] - active_index = np.sort( - list(set(npi_index).intersection(incid_index))) - nonactive_index = np.sort( - list(set(npi_index).difference(active_index))) - valdiff = 0 - for i in range(2, 6): - # these values >=1 are set to 1 - valdiff += (i-1)*len( - np.where(dummy_old[list(active_index)] == i)[0]) - # these values >=1 are set to 0 - valdiff += i * len( - np.where(dummy_old[list(nonactive_index)] == i)[0]) - # -99 always set to 0 - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) - - # Munich for M01a_010_4 and all dates (-99 becomes 0, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & 
(df_npis_old[dd.EngEng['npiCode']] == 'M01a_010_4')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 9162) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M01a_010_4'].values - incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 9162) & - (df_infec_rki[dd.EngEng['date']] <= - end_date_validation) & - (df_infec_rki[dd.EngEng['date']] >= - start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= 10)[0] - active_index = np.sort( - list(set(npi_index).intersection(incid_index))) - nonactive_index = np.sort( - list(set(npi_index).difference(active_index))) - valdiff = 0 - for i in range(2, 6): - # these values >=1 are set to 1 - valdiff += (i-1)*len( - np.where(dummy_old[list(active_index)] == i)[0]) - # these values >=1 are set to 0 - valdiff += i * len( - np.where(dummy_old[list(nonactive_index)] == i)[0]) - # -99 always set to 0 - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) - - # Weimar for M12_030_3 and all dates (-99 becomes 0, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 16071) & (df_npis_old[dd.EngEng['npiCode']] == 'M18_030_4')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 16071) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M18_030_4'].values - incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 16071) & - (df_infec_rki[dd.EngEng['date']] <= - end_date_validation) & - (df_infec_rki[dd.EngEng['date']] >= - start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= 50)[0] - active_index = np.sort( - list(set(npi_index).intersection(incid_index))) - nonactive_index = np.sort( - list(set(npi_index).difference(active_index))) - valdiff = 0 - for i in range(2, 6): - # these values >=1 are set to 1 - valdiff += (i-1)*len( - np.where(dummy_old[list(active_index)] == i)[0]) - # these values >=1 are set to 0 - valdiff += i * len( - np.where(dummy_old[list(nonactive_index)] == i)[0]) - # -99 always set to 0 - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) # TODO - # Berlin for M01b_020 and all dates (2 becomes 1, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & (df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 11000) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M01b_020'].values - incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 11000) & - (df_infec_rki[dd.EngEng['date']] <= - end_date_validation) & - (df_infec_rki[dd.EngEng['date']] >= - start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= 10)[0] - active_index = np.sort( - list(set(npi_index).intersection(incid_index))) - nonactive_index = np.sort( - list(set(npi_index).difference(active_index))) - valdiff = 0 - for i in range(2, 6): - # these values >=1 are set to 1 - valdiff += (i-1)*len( - np.where(dummy_old[list(active_index)] == i)[0]) - # these values >=1 are set to 0 - valdiff += i * len( - 
np.where(dummy_old[list(nonactive_index)] == i)[0]) - # -99 always set to 0 - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) # TODO - - # Segeberg for M02b_035 and all dates (2 -> 1, 3 -> 1, 5 -> 1) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & (df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 1060) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M02b_035'].values - incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 1060) & - (df_infec_rki[dd.EngEng['date']] <= - end_date_validation) & - (df_infec_rki[dd.EngEng['date']] >= - start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= 10)[0] - active_index = np.sort( - list(set(npi_index).intersection(incid_index))) - nonactive_index = np.sort( - list(set(npi_index).difference(active_index))) - valdiff = 0 - for i in range(2, 6): - # these values >=1 are set to 1 - valdiff += (i-1)*len( - np.where(dummy_old[list(active_index)] == i)[0]) - # these values >=1 are set to 0 - valdiff += i * len( - np.where(dummy_old[list(nonactive_index)] == i)[0]) - # -99 always set to 0 - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) # TODO - - # Steinfurt for M16_050 and all dates (4 -> 1, ...) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5566) & (df_npis_old[dd.EngEng['npiCode']] == 'M16_050')].iloc[:, :list( - df_npis_old.columns).index(end_date_validation_str)+1].values[0][start_npi_cols:] - dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == 5566) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), 'M16_050'].values - incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == 5566) & - (df_infec_rki[dd.EngEng['date']] <= - end_date_validation) & - (df_infec_rki[dd.EngEng['date']] >= - start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= 10)[0] - active_index = np.sort( - list(set(npi_index).intersection(incid_index))) - nonactive_index = np.sort( - list(set(npi_index).difference(active_index))) - valdiff = 0 - for i in range(2, 6): - # these values >=1 are set to 1 - valdiff += (i-1)*len( - np.where(dummy_old[list(active_index)] == i)[0]) - # these values >=1 are set to 0 - valdiff += i * len( - np.where(dummy_old[list(nonactive_index)] == i)[0]) - # -99 always set to 0 - valdiff += 99*len(np.where(dummy_old == -99)[0]) - print(abs(dummy_old-dummy_new).sum() == valdiff) + for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + for npiCode in [ + 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', + 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: + for subcode in [''] + ['_'+str(i) for i in range(1, 6)]: + [a, b, c, d] = validate(df_npis_old, df_npis, + df_infec_rki, countyID, npiCode + subcode, + start_npi_cols, npi_incid_start, + start_date_validation, end_date_validation) + if(a != b): + print('Error in NPI activation computation') x = 15 elif fine_resolution == 1: From 09dc29120a1c4e068d06d59cc7943d365899c709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Wed, 8 Jun 2022 20:45:47 +0200 Subject: [PATCH 011/104] Extension and correction for incidence-independent cats; WIP --- .../memilio/epidata/transformNPIData.py | 83 ++++++++++++++----- 1 file 
changed, 64 insertions(+), 19 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 2904e96814..270b7665fa 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -181,22 +181,56 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, start_npi_cols, npi_incid_start, start_date_validation, - end_date_validation): - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == countyID) & (df_npis_old[dd.EngEng['npiCode']] == npiCode) - ].iloc[:, :list(df_npis_old.columns).index(end_date_validation.strftime('d%Y%m%d'))+1].values[0][start_npi_cols:] + end_date_validation, fine_resolution): + if npiCode == 'M01a_150': + x = 15 # TODO + + if fine_resolution == 1: + npiCodes = [npiCode + code + for code in [''] + ['_' + str(i) for i in range(1, 6)]] + else: + npiCodes = [npiCode] + for npiCode in npiCodes: + dummy_old_rows = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == countyID) & (df_npis_old[dd.EngEng['npiCode']].isin( + npiCodes))].iloc[:, :list(df_npis_old.columns).index(end_date_validation.strftime('d%Y%m%d'))+1] + dummy_old = np.zeros( + (dummy_old_rows.shape[1]-start_npi_cols, dummy_old_rows.shape[0])) + for i in range(dummy_old.shape[1]): + dummy_old[:, i] = dummy_old_rows.values[i][start_npi_cols:] + dummy_new = df_npis.loc[(df_npis[dd.EngEng['idCounty']] == countyID) & ( - df_npis[dd.EngEng['date']] <= end_date_validation), npiCode].values + df_npis[dd.EngEng['date']] <= end_date_validation), npiCodes[0]].values incid = df_infec_rki[(df_infec_rki[dd.EngEng['idCounty']] == countyID) & (df_infec_rki[dd.EngEng['date']] <= end_date_validation) & (df_infec_rki[dd.EngEng['date']] >= start_date_validation)]['Incidence'].values - npi_index = np.where(dummy_old >= 1)[0] - incid_index = np.where(incid >= npi_incid_start[npiCode])[0] + + if fine_resolution == 1: + for col in range(dummy_old.shape[1]): + npi_index = np.where(dummy_old[:, col] >= 1)[0] + incid_index = np.where(incid >= npi_incid_start[npiCodes[col]])[0] + active_index = np.sort( + list(set(npi_index).intersection(incid_index))) + nonactive_index = np.sort( + list(set(npi_index).difference(active_index))) + + # deactivate values based on incidence before taking + # the maximum over NPI group + dummy_old[list(nonactive_index)] = 0 + + # the intermediate step is necessary since the array has a 2nd dimension + # of size zero if we directly set dummy_old = dummy_old.max(axis=1) + dummy_old[:, 0] = dummy_old.max(axis=1) + dummy_old = dummy_old[:, 0:1] + + npi_index = np.where(dummy_old[:, 0] >= 1)[0] + incid_index = np.where(incid >= npi_incid_start[npiCodes[0]])[0] active_index = np.sort( list(set(npi_index).intersection(incid_index))) nonactive_index = np.sort( list(set(npi_index).difference(active_index))) + valdiff = 0 for i in range(1, 6): # these values >=1 are set to 1 @@ -207,7 +241,7 @@ def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, np.where(dummy_old[list(nonactive_index)] == i)[0]) # -99 always set to 0 valdiff += 99*len(np.where(dummy_old == -99)[0]) - return [abs(dummy_old-dummy_new).sum(), valdiff, dummy_old, dummy_new] + return [abs(dummy_old[:, 0]-dummy_new).sum(), valdiff, dummy_old, dummy_new] def print_manual_download(filename, url): @@ -431,11 +465,9 @@ def transform_npi_data(fine_resolution=2, # create hash table from parental 
or main code/main category # to list of subcodes/subcategories maincode_to_npicodes_map = dict() - npicodes_to_maincode_map = dict() major_code = npi_codes[0] maincode_to_npicodes_map[major_code] = [] for code in npi_codes: - npicodes_to_maincode_map[code] = major_code if major_code in code: maincode_to_npicodes_map[major_code].append(code) else: @@ -493,7 +525,7 @@ def transform_npi_data(fine_resolution=2, sys.exit('Error. Other counties than that of Eisenach were removed.') # remove rows for Eisenach df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( - unique_geo_entities)] + unique_geo_entities)].reset_index(drop=True).drop(columns='index') start_npi_cols = list( df_npis_old.columns).index( @@ -740,18 +772,31 @@ def transform_npi_data(fine_resolution=2, 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: for subcode in [''] + ['_'+str(i) for i in range(1, 6)]: - [a, b, c, d] = validate(df_npis_old, df_npis, - df_infec_rki, countyID, npiCode + subcode, - start_npi_cols, npi_incid_start, - start_date_validation, end_date_validation) + [ + a, b, oldf, newf] = validate( + df_npis_old, df_npis, df_infec_rki, countyID, + npiCode + subcode, start_npi_cols, npi_incid_start, + start_date_validation, end_date_validation, + fine_resolution) if(a != b): print('Error in NPI activation computation') - x = 15 elif fine_resolution == 1: - # replace more detailed code names X_Y with major code X - df_npis_old[dd.EngEng['npiCode']] = df_npis_old[dd.EngEng[ - 'npiCode']].replace(npicodes_to_maincode_map) + start_date_validation = datetime(2020, 3, 1) + end_date_validation = datetime(2022, 2, 15) + + for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + for npiCode in [ + 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', + 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: + [a, b, oldf, newf] = validate(df_npis_old, df_npis, + df_infec_rki, countyID, npiCode, start_npi_cols, + npi_incid_start, start_date_validation, + end_date_validation, fine_resolution) + if(a != b): + print('Error in NPI activation computation') + else: + print(a == b, a, b) # Cologne for M01a_010 and all dates (no changes) dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & ( @@ -1122,7 +1167,7 @@ def main(): """! 
Main program entry.""" # arg_dict = gd.cli("testing") - transform_npi_data(fine_resolution=2, read_data=False, make_plot=True) + transform_npi_data(fine_resolution=1, read_data=False, make_plot=True) if __name__ == "__main__": From a2122a1ef45d9d094c182be89647d0956b8d39f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 9 Jun 2022 09:41:44 +0200 Subject: [PATCH 012/104] Validation for fine=1 completed --- .../memilio/epidata/transformNPIData.py | 91 +------------------ 1 file changed, 1 insertion(+), 90 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 270b7665fa..38328fbdab 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -182,8 +182,6 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, start_npi_cols, npi_incid_start, start_date_validation, end_date_validation, fine_resolution): - if npiCode == 'M01a_150': - x = 15 # TODO if fine_resolution == 1: npiCodes = [npiCode + code @@ -217,7 +215,7 @@ def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, # deactivate values based on incidence before taking # the maximum over NPI group - dummy_old[list(nonactive_index)] = 0 + dummy_old[:, col][list(nonactive_index)] = 0 # the intermediate step is necessary since the array has a 2nd dimension # of size zero if we directly set dummy_old = dummy_old.max(axis=1) @@ -795,93 +793,6 @@ def transform_npi_data(fine_resolution=2, end_date_validation, fine_resolution) if(a != b): print('Error in NPI activation computation') - else: - print(a == b, a, b) - - # Cologne for M01a_010 and all dates (no changes) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[0][start_npi_cols:] - for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5315) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 5315, 'M01a_010'].values - print(abs(dummy_old-dummy_new).sum() == 0) - - # Flensburg for M05_120 and all dates ('2's become '1's) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1001) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M05_120')].values[0][start_npi_cols:] - for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, - df_npis_old - [(df_npis_old - [dd.EngEng['idCounty']] == 1001) & - (df_npis_old[dd.EngEng['npiCode']] == - 'M05_120')].values[subcode] - [start_npi_cols:]) - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 1001, 'M05_120'].values - print(abs(dummy_old-dummy_new).sum() == 5) - - # Munich for M01a_010 and all dates (-99 becomes 0, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[0][start_npi_cols:] - for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 9162) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01a_010')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 9162, 'M01a_010'].values - print(abs(dummy_old-dummy_new).sum() == 0) - - # Weimar 
for M12_030_3 and all dates (-99 becomes 0, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 16071) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M12_030')].values[0][start_npi_cols:] - for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, - df_npis_old - [(df_npis_old - [dd.EngEng['idCounty']] == 16071) & - (df_npis_old[dd.EngEng['npiCode']] == - 'M12_030')].values[subcode] - [start_npi_cols:]) - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 16071, 'M12_030'].values - print(abs(dummy_old-dummy_new).sum() == 19) - - # Berlin for M01b_020 and all dates (2 becomes 1, 1 stays 1, 0 stays 0) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].values[0][start_npi_cols:] - for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 11000) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M01b_020')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 11000, 'M01b_020'].values - print(abs(dummy_old-dummy_new).sum() == 82) - - # Segeberg for M02b_035 and all dates (2 -> 1, 3 -> 1, 5 -> 1) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].values[0][start_npi_cols:] - for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 1060) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M02b_035')].values[subcode][start_npi_cols:]) - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 1060, 'M02b_035'].values - print(abs(dummy_old-dummy_new).sum() == 151+2*53+4*22) - - # Steinfurt for M16_050 and all dates (4 -> 1, ...) - dummy_old = df_npis_old[(df_npis_old[dd.EngEng['idCounty']] == 5566) & ( - df_npis_old[dd.EngEng['npiCode']] == 'M16_050')].values[0][start_npi_cols:] - for subcode in range(1, 6): # add subcode values - dummy_old = np.maximum(dummy_old, - df_npis_old - [(df_npis_old - [dd.EngEng['idCounty']] == 5566) & - (df_npis_old[dd.EngEng['npiCode']] == - 'M16_050')].values[subcode] - [start_npi_cols:]) - dummy_new = df_npis.loc[df_npis[dd.EngEng['idCounty']] == - 5566, 'M16_050'].values - print(abs(dummy_old-dummy_new).sum() == 32+2*20+3*22) #### end validation #### if fine_resolution > 0: From 8a8322a602e3bff1011fc05f905e4ffa9164e469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 9 Jun 2022 12:27:09 +0200 Subject: [PATCH 013/104] remove placeholder categories --- .../memilio/epidata/transformNPIData.py | 43 ++++++++++++++----- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 38328fbdab..b00e8f209a 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -370,8 +370,10 @@ def transform_npi_data(fine_resolution=2, # get existing codes that are used (in df_npis_old M22-M24 are empty) npi_codes_prior = df_npis_desc['Variablenname'] + npi_codes_prior_desc = df_npis_desc['Variable'] # correct differences in codes between data sheet and explanation sheet + codes_dropped = [] # no dropping for fine_resolution == 0 if fine_resolution > 0: # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} # (M04_1, i.e. 
i=1, has been corrected in original file but not for i>1)
@@ -413,9 +415,21 @@ def transform_npi_data(fine_resolution=2,
         # on these codes, so drop the rows.
         codes_dropped = list(set(npi_codes_prior_data).difference(
             npi_codes_prior))
+        # also remove dummy 'Platzhalter' categories
+        dummy_categories = []
+        for i in range(len(npi_codes_prior)):
+            if 'Platzhalter' in npi_codes_prior_desc[i]:
+                dummy_categories.append(npi_codes_prior[i])
+        # codes without explanation and dummy categories
+        # sorting done for consistency, maybe not necessary
+        codes_dropped = list(np.sort(codes_dropped + dummy_categories))
         if len(codes_dropped) > 0:
             df_npis_old = df_npis_old[~df_npis_old[dd.EngEng['npiCode']].isin(
-                codes_dropped)].reset_index()
+                codes_dropped)].reset_index(drop=True)
+        # for every main code removed, all 5 subcodes have to be removed;
+        # if this is not the case, the naming of them is wrong/not consistent
+        if (len(codes_dropped) % 6) != 0:
+            sys.exit('Error in NPI names, please check.')
 
     # sort NPI codes according to numeric values (argsort gives indices
     # in input list to be used for sorted array)
@@ -449,10 +463,15 @@ def transform_npi_data(fine_resolution=2,
     # extract variable names for main categories
     npi_desc = list(df_npis_desc["Variable"][npi_codes_sorting])
 
-    # NPIs groups codes and description to ensure that both are ordered
-    # the same way
-    npis_dummy = {dd.EngEng['npiCode']: npi_codes, dd.EngEng['desc']: npi_desc}
+    # NPIs group codes and description to ensure that both are ordered
+    # the same way; do not use npi_codes or npi_desc hereafter
+    idx_codes_retained = ~pd.Series(npi_codes).isin(codes_dropped)
+    npis_dummy = {
+        dd.EngEng['npiCode']: list(pd.Series(npi_codes)[idx_codes_retained]),
+        dd.EngEng['desc']: list(pd.Series(npi_desc)[idx_codes_retained])}
     npis = pd.DataFrame(npis_dummy)
+    del npi_codes
+    del npi_desc
 
     # transform data from original format to desired format
     if not read_data:
@@ -463,9 +482,9 @@ def transform_npi_data(fine_resolution=2,
             # create hash table from parental or main code/main category
             # to list of subcodes/subcategories
             maincode_to_npicodes_map = dict()
-            major_code = npi_codes[0]
+            major_code = npis.iloc[:, 0][0]
            maincode_to_npicodes_map[major_code] = []
-            for code in npi_codes:
+            for code in npis.iloc[:, 0]:
                 if major_code in code:
                     maincode_to_npicodes_map[major_code].append(code)
                 else:
@@ -523,7 +542,7 @@ def transform_npi_data(fine_resolution=2,
         sys.exit('Error. Other counties than that of Eisenach were removed.')
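     # Background (editor's sketch, not part of this patch): Eisenach
     # (county ID 16056) was merged into the Wartburgkreis (ID 16063) in
     # July 2021, so geoger.get_county_ids() no longer lists it; this is
     # why it is the only county that may legitimately be dropped here.
     # A guard for that assumption could look like:
     #   dropped = set(df_npis_old[dd.EngEng['idCounty']]).difference(
     #       unique_geo_entities)
     #   assert dropped <= {16056}, dropped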
     # remove rows for Eisenach
     df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin(
-        unique_geo_entities)].reset_index(drop=True).drop(columns='index')
+        unique_geo_entities)].reset_index(drop=True)
 
     start_npi_cols = list(
         df_npis_old.columns).index(
@@ -617,7 +636,7 @@ def transform_npi_data(fine_resolution=2,
         df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']]
                                    == countyID].copy()
 
-        # remove potential rows of which codes are not in npi_codes_considered
+        # potentially remove rows if they are not in npis dict
         npi_rows = [i in npis[dd.EngEng['npiCode']].values
                     for i in df_local_old[dd.EngEng['npiCode']]]
@@ -760,8 +779,6 @@ def transform_npi_data(fine_resolution=2,
 
     #### start validation ####
     if fine_resolution == 2:
-        # the following validation is only valid for end_date May 14, 2021
-        # and data frames of mentioned NPIs, not active NPIs
         start_date_validation = datetime(2020, 3, 1)
         end_date_validation = datetime(2022, 2, 15)
 
@@ -778,6 +795,8 @@ def transform_npi_data(fine_resolution=2,
                     fine_resolution)
                 if(a != b):
                     print('Error in NPI activation computation')
+                else:
+                    print(a, b, a == b)
 
     elif fine_resolution == 1:
         start_date_validation = datetime(2020, 3, 1)
         end_date_validation = datetime(2022, 2, 15)
 
@@ -793,6 +812,8 @@ def transform_npi_data(fine_resolution=2,
                     end_date_validation, fine_resolution)
                 if(a != b):
                     print('Error in NPI activation computation')
+                else:
+                    print(a, b, a == b)
 
     #### end validation ####
 
     if fine_resolution > 0:
@@ -1078,7 +1099,7 @@ def main():
     """! Main program entry."""
 
     # arg_dict = gd.cli("testing")
-    transform_npi_data(fine_resolution=1, read_data=False, make_plot=True)
+    transform_npi_data(fine_resolution=2, read_data=False, make_plot=True)
 
 
 if __name__ == "__main__":

From 9ca004105dab396a267aac5b5bb8f88c67320a2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Thu, 9 Jun 2022 12:52:58 +0200
Subject: [PATCH 014/104] outsourcing of analysis and plot to WIP function;
 correction of indentation for fine_res=0

---
 .../memilio/epidata/transformNPIData.py       | 804 +++++++++---------
 1 file changed, 404 insertions(+), 400 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index b00e8f209a..7ef3139c5a 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -251,12 +251,10 @@ def print_manual_download(filename, url):
 
 
 def transform_npi_data(fine_resolution=2,
-                       read_data=dd.defaultDict['read_data'],
                        file_format=dd.defaultDict['file_format'],
                        out_folder=dd.defaultDict['out_folder'],
                        start_date=dd.defaultDict['start_date'],
-                       end_date=dd.defaultDict['end_date'],
-                       make_plot=dd.defaultDict['make_plot'],
+                       end_date=dd.defaultDict['end_date']
                        ):
     """! 
Loads a certain resolution of recorded NPI data from the Corona Datenplattform and transforms it according to the @@ -295,60 +293,46 @@ def transform_npi_data(fine_resolution=2, directory = os.path.join(directory, 'Germany/') gd.check_dir(directory) - if not read_data: - - if fine_resolution > 0: - # defines delay in number of days between exceeding - # incidence threshold and NPI getting active - npi_activation_delay = 0 - print('Using a delay of NPI activation of ' + - str(npi_activation_delay) + ' days.') - - try: - df_npis_old = pd.read_csv( - os.path.join( - directory, 'kr_massnahmen_unterkategorien.csv'), - sep=',') # , nrows=1248) # 1248 for debugging, only reading Flensburg - except FileNotFoundError: - print_manual_download( - 'kr_massnahmen_unterkategorien.csv', - 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') - raise FileNotFoundError - df_npis_old.rename(dd.GerEng, axis=1, inplace=True) - - # check if rows hospitals and geriatric care are still empty - # these fields have been empty so far and are thus not used - test_codes = ['M23_010', 'M23_020', 'M23_030', 'M23_040', - 'M23_050', 'M23_060', 'M24_010', 'M24_020', - 'M24_030', 'M24_040', 'M24_050', 'M24_060'] - for tcode in test_codes: - for i in [''] + ["_" + str(i) for i in range(1, 6)]: - if(df_npis_old[df_npis_old[dd.EngEng['npiCode']] == tcode+i].iloc[:, 6:].max().max() > 0): - print(tcode+i + " used.") - # end check - - else: # read aggregated NPIs - - try: - df_npis_old = pd.read_csv(os.path.join( - directory, 'kr_massnahmen_oberkategorien.csv')) - except FileNotFoundError: - print_manual_download( - 'datensatzbeschreibung_massnahmen.xlsx', - 'https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise') - raise FileNotFoundError - df_npis_old.rename(dd.GerEng, axis=1, inplace=True) + if fine_resolution > 0: + # defines delay in number of days between exceeding + # incidence threshold and NPI getting active + npi_activation_delay = 0 + print('Using a delay of NPI activation of ' + + str(npi_activation_delay) + ' days.') - else: # read formatted file + try: + df_npis_old = pd.read_csv( + os.path.join(directory, 'kr_massnahmen_unterkategorien.csv'), + sep=',') # , nrows=1248) # 1248 for debugging, only reading Flensburg + except FileNotFoundError: + print_manual_download( + 'kr_massnahmen_unterkategorien.csv', + 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') + raise FileNotFoundError + df_npis_old.rename(dd.GerEng, axis=1, inplace=True) + + # check if rows hospitals and geriatric care are still empty; + # these fields have been empty so far and are thus not used + test_codes = ['M23_010', 'M23_020', 'M23_030', 'M23_040', + 'M23_050', 'M23_060', 'M24_010', 'M24_020', + 'M24_030', 'M24_040', 'M24_050', 'M24_060'] + for tcode in test_codes: + for i in [''] + ["_" + str(i) for i in range(1, 6)]: + if(df_npis_old[df_npis_old[dd.EngEng['npiCode']] == tcode+i].iloc[:, 6:].max().max() > 0): + print(tcode+i + " used.") + # end check + + else: # read aggregated NPIs - if fine_resolution > 0: - if fine_resolution == 1: - filename = 'germany_counties_npi_subcat_incgrouped' - else: - filename = 'germany_counties_npi_subcat' - else: - filename = 'germany_counties_npi_maincat' - df_npis = pd.read_json(directory + filename + ".json") + try: + df_npis_old = pd.read_csv(os.path.join( + directory, 'kr_massnahmen_oberkategorien.csv')) + except FileNotFoundError: + print_manual_download( + 'datensatzbeschreibung_massnahmen.xlsx', + 
'https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise') + raise FileNotFoundError + df_npis_old.rename(dd.GerEng, axis=1, inplace=True) # read data frame of variable names and descriptions try: @@ -390,10 +374,7 @@ def transform_npi_data(fine_resolution=2, npi_codes_prior[npi_codes_prior == 'M16_200_2'] = 'M16_100_2' # check for missing codes - if not read_data: - npi_codes_prior_data = df_npis_old[dd.EngEng['npiCode']].unique() - else: - npi_codes_prior_data = list(df_npis.columns[2:]) + npi_codes_prior_data = df_npis_old[dd.EngEng['npiCode']].unique() missing_codes = list(set(npi_codes_prior).difference( npi_codes_prior_data)) @@ -463,8 +444,8 @@ def transform_npi_data(fine_resolution=2, # extract variable names for main categories npi_desc = list(df_npis_desc["Variable"][npi_codes_sorting]) - # NPIs group codes and description to ensure that both are ordered - # the same way; do not use npi_codes or npi_desc hereafter + # combine NPI codes and descriptions to ensure that both are ordered + # the same way; delete npi_codes or npi_desc for not using hereafter idx_codes_retained = ~pd.Series(npi_codes).isin(codes_dropped) npis_dummy = { dd.EngEng['npiCode']: list(pd.Series(npi_codes)[idx_codes_retained]), @@ -473,55 +454,53 @@ def transform_npi_data(fine_resolution=2, del npi_codes del npi_desc - # transform data from original format to desired format - if not read_data: - # prepare grouping of NPIs to reduce product space of - # NPI x active_from_inc (with values "incidence does not matter", and - # incidence 0, 10, 35, 50, 100) to NPI - if fine_resolution == 1: - # create hash table from parental or main code/main category - # to list of subcodes/subcategories - maincode_to_npicodes_map = dict() - major_code = npis.iloc[:, 0][0] - maincode_to_npicodes_map[major_code] = [] - for code in npis.iloc[:, 0]: - if major_code in code: - maincode_to_npicodes_map[major_code].append(code) - else: - major_code = code - maincode_to_npicodes_map[major_code] = [code] + # prepare grouping of NPIs to reduce product space of + # NPI x active_from_inc (with values "incidence does not matter", and + # incidence 0, 10, 35, 50, 100) to NPI + if fine_resolution == 1: + # create hash table from parental or main code/main category + # to list of subcodes/subcategories + maincode_to_npicodes_map = dict() + major_code = npis.iloc[:, 0][0] + maincode_to_npicodes_map[major_code] = [] + for code in npis.iloc[:, 0]: + if major_code in code: + maincode_to_npicodes_map[major_code].append(code) + else: + major_code = code + maincode_to_npicodes_map[major_code] = [code] - npi_codes_aggregated = [] - for main_code in maincode_to_npicodes_map.keys(): - if main_code.count('_') > 1: - sys.exit('Error. Subcode assigned as main code.') - npi_codes_aggregated.append(main_code) + npi_codes_aggregated = [] + for main_code in maincode_to_npicodes_map.keys(): + if main_code.count('_') > 1: + sys.exit('Error. 
Subcode assigned as main code.') + npi_codes_aggregated.append(main_code) - npis_final = npis[npis[dd.EngEng['npiCode']].isin( - npi_codes_aggregated)].reset_index() - else: - npis_final = npis + npis_final = npis[npis[dd.EngEng['npiCode']].isin( + npi_codes_aggregated)].reset_index() + else: + npis_final = npis - # extract incidence-threshold for NPIs - if fine_resolution > 0: - npi_incid_start = dict() - for i in range(len(npis)): - incid_threshold = 1e10 - if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': - # set -1 for incidence-independent NPIs - incid_threshold = -1 - elif npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': - incid_threshold = int( - npis.loc[i, dd.EngEng['desc']].split(' ')[1]) - else: - sys.exit( - 'Error in description file. NPI activation can not ' - 'be computed. Exiting.') - npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] - ] = incid_threshold + # extract incidence-threshold for NPIs + if fine_resolution > 0: + npi_incid_start = dict() + for i in range(len(npis)): + incid_threshold = 1e10 + if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': + # set -1 for incidence-independent NPIs + incid_threshold = -1 + elif npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': + incid_threshold = int( + npis.loc[i, dd.EngEng['desc']].split(' ')[1]) + else: + sys.exit( + 'Error in description file. NPI activation can not ' + 'be computed. Exiting.') + npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] + ] = incid_threshold - # get all incidence thresholds - incidence_thresholds = sorted(set(npi_incid_start.values())) + # get all incidence thresholds + incidence_thresholds = sorted(set(npi_incid_start.values())) # create hash map from thresholds to NPI indices incidence_thresholds_to_npis = dict( @@ -531,290 +510,324 @@ def transform_npi_data(fine_resolution=2, [i, dd.EngEng['npiCode']]] incidence_thresholds_to_npis[incval].append(i) - # get county ids - unique_geo_entities = geoger.get_county_ids() - # check if more than the county of Eisenach would be removed with - # current county list - counties_removed = df_npis_old[ - ~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][ - dd.EngEng['idCounty']].unique() - if len(counties_removed) == 1 and counties_removed[0] != 16056: - sys.exit('Error. Other counties than that of Eisenach were removed.') - # remove rows for Eisenach - df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( - unique_geo_entities)].reset_index(drop=True) - - start_npi_cols = list( - df_npis_old.columns).index( - dd.EngEng['npiCode']) + 1 - - # store string dates 'dYYYYMMDD' in list before parsing - str_dates = list(df_npis_old.iloc[:, start_npi_cols:].columns) - # convert string dates into other format - dates_new = [datetime.strptime(old_date, "d%Y%m%d") - for old_date in str_dates] - - # check for missing dates - date_diff = [ - (dates_new[i + 1] - dates_new[i]).days - for i in range(len(dates_new) - 1)] - date_diff_idx = np.where(np.array(date_diff) > 1)[0] - if max(date_diff) > 1: - print("Error. Dates missing in data frame:") - for i in date_diff_idx: - print( - "\t - From " + str(dates_new[i] + timedelta(1)) + " until " + - str(dates_new[i] + timedelta(date_diff[i] - 1))) - sys.exit('Exiting. 
Dates missing in data frame.') - - min_date = [] - max_date = [] - - # get RKI infectious numbers to find dates where incidence-dependent - # NPIs were active + # get county ids + unique_geo_entities = geoger.get_county_ids() + # check if more than the county of Eisenach would be removed with + # current county list + counties_removed = df_npis_old[ + ~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][ + dd.EngEng['idCounty']].unique() + if len(counties_removed) == 1 and counties_removed[0] != 16056: + sys.exit('Error. Other counties than that of Eisenach were removed.') + # remove rows for Eisenach + df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( + unique_geo_entities)].reset_index(drop=True) + + start_npi_cols = list( + df_npis_old.columns).index( + dd.EngEng['npiCode']) + 1 + + # store string dates 'dYYYYMMDD' in list before parsing + str_dates = list(df_npis_old.iloc[:, start_npi_cols:].columns) + # convert string dates into other format + dates_new = [datetime.strptime(old_date, "d%Y%m%d") + for old_date in str_dates] + + # check for missing dates + date_diff = [ + (dates_new[i + 1] - dates_new[i]).days + for i in range(len(dates_new) - 1)] + date_diff_idx = np.where(np.array(date_diff) > 1)[0] + if max(date_diff) > 1: + print("Error. Dates missing in data frame:") + for i in date_diff_idx: + print( + "\t - From " + str(dates_new[i] + timedelta(1)) + " until " + + str(dates_new[i] + timedelta(date_diff[i] - 1))) + sys.exit('Exiting. Dates missing in data frame.') + + min_date = [] + max_date = [] + + # get RKI infectious numbers to find dates where incidence-dependent + # NPIs were active + if fine_resolution > 0: + df_infec_rki = pd.read_json(os.path.join( + directory, 'all_county_all_dates_repdate_rki.json')) + df_infec_rki[dd.EngEng['date']] = pd.to_datetime( + df_infec_rki[dd.EngEng['date']]) + df_population = pd.read_json( + directory + "county_current_population.json") + min_date.append( + df_infec_rki[dd.EngEng['date']].min().to_pydatetime()) + max_date.append( + df_infec_rki[dd.EngEng['date']].max().to_pydatetime()) + + # adapt time series according to available dates and start_date, + # end_date input parameter + start_date_new = max( + min_date + [min(dates_new), pd.to_datetime(start_date)]) + end_date_new = min( + max_date + [max(dates_new), + pd.to_datetime(end_date)]) + + # create new data frame for all NPIs given in the columns, + # resolved by county and day + df_npis = pd.DataFrame( + columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + + list(npis_final[dd.EngEng['npiCode']])) + # convert NPI data from object to int such that correlations can be + # computed + df_npis = df_npis.astype(dict( + zip( + [dd.EngEng['date']] + [dd.EngEng['idCounty']] + + list(npis_final[dd.EngEng['npiCode']]), ['str', 'int'] + + ['int' for i in npis_final[dd.EngEng['npiCode']]]))) + + # iterate over countyIDs + counters = np.zeros(5) # time counter for output only + countyidx = 0 + # unique_geo_entities: + for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + cid = 0 + countyidx += 1 + if fine_resolution > 0: - df_infec_rki = pd.read_json(os.path.join( - directory, 'all_county_all_dates_repdate_rki.json')) - df_infec_rki[dd.EngEng['date']] = pd.to_datetime( - df_infec_rki[dd.EngEng['date']]) - df_population = pd.read_json( - directory + "county_current_population.json") - min_date.append( - df_infec_rki[dd.EngEng['date']].min().to_pydatetime()) - max_date.append( - df_infec_rki[dd.EngEng['date']].max().to_pydatetime()) - - # adapt time series 
according to available dates and start_date, - # end_date input parameter - start_date_new = max( - min_date + [min(dates_new), pd.to_datetime(start_date)]) - end_date_new = min( - max_date + [max(dates_new), - pd.to_datetime(end_date)]) - - # create new data frame for all NPIs given in the columns, - # resolved by county and day - df_npis = pd.DataFrame( + # compute incidence based on previous data frames + df_infec_local = df_infec_rki[df_infec_rki[dd.EngEng['idCounty']] == countyID].copy( + ) + pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] + == countyID, dd.EngEng['population']].values[0] + incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( + periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) + df_infec_local['Incidence'] = incidence_local / pop_local * 100000 + + # set to main data frame + df_infec_rki.loc[df_infec_rki[dd.EngEng['idCounty']] == + countyID, 'Incidence'] = df_infec_local['Incidence'].values + + # cut infection information at start_date_new and end_date_new + df_infec_local = df_infec_local.loc[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( + df_infec_local[dd.EngEng['date']] <= end_date_new), :].reset_index() + + # get county-local data frame + start_time = time.perf_counter() + df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] + == countyID].copy() + + # potentially remove rows if they are not in npis dict + npi_rows = [i in npis[dd.EngEng['npiCode']].values + for i in df_local_old[dd.EngEng['npiCode']]] + + # get list of NPI codes, ordered as the rows in the current data frame + npi_codes_ordered_as_rows = df_local_old[dd.EngEng['npiCode']][ + npi_rows].to_list() + + # get indices of rows for the NPI codes as in the sorted npi_codes list + # may be superfluous if NPI code rows are sorted correctly + npi_code_rows_to_sorted = [ + npi_codes_ordered_as_rows.index(i) for i in + npis[dd.EngEng['npiCode']].values] + + # access NPI values matrix and store it as integers + npi_vals = df_local_old.iloc[npi_rows, start_npi_cols:].astype(int) + + # create columns for date, county ID and NPI code + df_local_new = pd.DataFrame( columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis_final[dd.EngEng['npiCode']])) - # convert NPI data from object to int such that correlations can be - # computed - df_npis = df_npis.astype(dict( - zip( - [dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis_final[dd.EngEng['npiCode']]), ['str', 'int'] + - ['int' for i in npis_final[dd.EngEng['npiCode']]]))) - - # iterate over countyIDs - counters = np.zeros(5) # time counter for output only - countyidx = 0 - # unique_geo_entities: - for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: - cid = 0 - countyidx += 1 + list(npis[dd.EngEng['npiCode']])) + + counters[cid] += time.perf_counter()-start_time + cid += 1 + + start_time = time.perf_counter() + + # fill in NPI values by transposing from columns to rows + df_local_new[dd.EngEng['date']] = dates_new + df_local_new[dd.EngEng['idCounty']] = countyID + # possible resorting of rows such that they are sorted according to + # a literal sorting of the code strings + df_local_new[npis[dd.EngEng['npiCode']]] = np.transpose( + npi_vals.iloc[npi_code_rows_to_sorted, :].values) + + counters[cid] += time.perf_counter()-start_time + cid += 1 + + start_time = time.perf_counter() + + # replace -99 ("not used anymore") by 0 ("not used") + df_local_new[npis[dd.EngEng['npiCode']] + ] = df_local_new[npis[dd.EngEng['npiCode']]].replace(-99, 0) + # replace 2,3,4,5 ("mentioned in ...") 
by 1 ("mentioned") + df_local_new[npis[dd.EngEng['npiCode']] + ] = df_local_new[npis[dd.EngEng['npiCode']]].replace([2, 3, 4, 5], 1) + + counters[cid] += time.perf_counter()-start_time + cid += 1 + + ### evaluate NPIs mentioned with respect to confirmed cases ### + # values > 0 + # - for NPIs independent of new infections mean "mentioned" = "active" + # - for NPIs dependent on incidence "mentioned" does not mean + # active and evaluation has to be conducted against confirmed + # infections to determine whether the NPI was active + start_time = time.perf_counter() + if fine_resolution > 0: + # cut NPI information at start_date_new and end_date_new + df_local_new = df_local_new.loc[(df_local_new[dd.EngEng['date']] >= start_date_new) & ( + df_local_new[dd.EngEng['date']] <= end_date_new), :].reset_index() + try: + df_local_new = df_local_new.drop(columns='index') + except: + pass + # get index of first NPI column in local data frame + npis_idx_start = list( + df_local_new.columns).index( + npis[dd.EngEng['npiCode']][0]) + + # iterate through all NPIs and activate if incidence threshold + # is exceeded + for incidvalthrsh, npi_indices in incidence_thresholds_to_npis.items(): + if incidvalthrsh >= 0: + local_incid = df_infec_local['Incidence'].copy() + if npi_activation_delay > 0: + # shift values to npi_activation_delay days later + local_incid.iloc[npi_activation_delay: + ] = local_incid.iloc[0:-npi_activation_delay].values + # take constant value of day 0 for first delay days + local_incid.iloc[:npi_activation_delay] = local_incid.iloc[0] + # compare incidence against threshold + int_active = ( + local_incid >= incidvalthrsh).astype(int) + # multiply rows of data frame by either 1 if threshold + # passed (i.e., mentioned NPI is active) or zero + # (i.e., mentioned NPI is not active) + # 'mul' multiplies the original data frame row by row + # with the respective value in int_active + df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ + = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) + + # reduction of factor space NPI x incidence threshold to NPI + # by max aggregation of all incidence threshold columns per NPI + if fine_resolution == 1: + for main_code, codes_group in maincode_to_npicodes_map.items(): + # group by incidence (former codes X1_Y, X1_Z were transformed + # to X1, X2) and write max value to main code column + df_local_new.loc[:, main_code] = df_local_new.loc[:, codes_group].max( + axis=1) + # remove subcategory columns + df_local_new = df_local_new.loc[:, [ + dd.EngEng['date'], dd.EngEng['idCounty']] + npi_codes_aggregated].copy() + + counters[cid] += time.perf_counter()-start_time + cid += 1 + ### ### + + start_time = time.perf_counter() + df_npis = df_npis.append(df_local_new, + ignore_index=True).copy() + counters[cid] += time.perf_counter()-start_time + cid += 1 + + # divide working time by completed number of counties and multiply + # by remaining number of counties to estimate time remaining + time_remain = sum( + counters) / countyidx * (len(unique_geo_entities) - countyidx) + # print progress + if countyidx == 1 or countyidx % int( + len(unique_geo_entities) / 10) == 0: + print('Progress ' + str(countyidx) + ' / ' + + str(len(unique_geo_entities)) + + '. 
Estimated time remaining: ' + + str(int(time_remain / 60)) + ' min.') + + # print sub counters + print('Sub task counters are: ') + print(counters) + + # reset index and drop old index column + df_npis.reset_index(inplace=True) + try: + df_npis = df_npis.drop(columns='index') + except: + pass + try: + df_npis = df_npis.drop(columns='level_0') + except: + pass - if fine_resolution > 0: - # compute incidence based on previous data frames - df_infec_local = df_infec_rki[df_infec_rki[dd.EngEng['idCounty']] == countyID].copy( - ) - pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] - == countyID, dd.EngEng['population']].values[0] - incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( - periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) - df_infec_local['Incidence'] = incidence_local / pop_local * 100000 - - # set to main data frame - df_infec_rki.loc[df_infec_rki[dd.EngEng['idCounty']] == - countyID, 'Incidence'] = df_infec_local['Incidence'].values - - # cut infection information at start_date_new and end_date_new - df_infec_local = df_infec_local.loc[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( - df_infec_local[dd.EngEng['date']] <= end_date_new), :].reset_index() - - # get county-local data frame - start_time = time.perf_counter() - df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] - == countyID].copy() - - # potentially remove rows if they are not in npis dict - npi_rows = [i in npis[dd.EngEng['npiCode']].values - for i in df_local_old[dd.EngEng['npiCode']]] - - # get list of NPI codes, ordered as the rows in the current data frame - npi_codes_ordered_as_rows = df_local_old[dd.EngEng['npiCode']][ - npi_rows].to_list() - - # get indices of rows for the NPI codes as in the sorted npi_codes list - # may be superfluous if NPI code rows are sorted correctly - npi_code_rows_to_sorted = [ - npi_codes_ordered_as_rows.index(i) for i in - npis[dd.EngEng['npiCode']].values] - - # access NPI values matrix and store it as integers - npi_vals = df_local_old.iloc[npi_rows, start_npi_cols:].astype(int) - - # create columns for date, county ID and NPI code - df_local_new = pd.DataFrame( - columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis[dd.EngEng['npiCode']])) - - counters[cid] += time.perf_counter()-start_time - cid += 1 - - start_time = time.perf_counter() - - # fill in NPI values by transposing from columns to rows - df_local_new[dd.EngEng['date']] = dates_new - df_local_new[dd.EngEng['idCounty']] = countyID - # possible resorting of rows such that they are sorted according to - # a literal sorting of the code strings - df_local_new[npis[dd.EngEng['npiCode']]] = np.transpose( - npi_vals.iloc[npi_code_rows_to_sorted, :].values) - - counters[cid] += time.perf_counter()-start_time - cid += 1 - - start_time = time.perf_counter() - - # replace -99 ("not used anymore") by 0 ("not used") - df_local_new[npis[dd.EngEng['npiCode']] - ] = df_local_new[npis[dd.EngEng['npiCode']]].replace(-99, 0) - # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") - df_local_new[npis[dd.EngEng['npiCode']] - ] = df_local_new[npis[dd.EngEng['npiCode']]].replace([2, 3, 4, 5], 1) - - counters[cid] += time.perf_counter()-start_time - cid += 1 - - ### evaluate NPIs mentioned with respect to confirmed cases ### - # values > 0 - # - for NPIs independent of new infections mean "mentioned" = "active" - # - for NPIs dependent on incidence "mentioned" does not mean - # active and evaluation has to be conducted against confirmed - # infections to determine 
whether the NPI was active - start_time = time.perf_counter() - if fine_resolution > 0: - # cut NPI information at start_date_new and end_date_new - df_local_new = df_local_new.loc[(df_local_new[dd.EngEng['date']] >= start_date_new) & ( - df_local_new[dd.EngEng['date']] <= end_date_new), :].reset_index() - try: - df_local_new = df_local_new.drop(columns='index') - except: - pass - # get index of first NPI column in local data frame - npis_idx_start = list( - df_local_new.columns).index( - npis[dd.EngEng['npiCode']][0]) - - # iterate through all NPIs and activate if incidence threshold - # is exceeded - for incidvalthrsh, npi_indices in incidence_thresholds_to_npis.items(): - if incidvalthrsh >= 0: - local_incid = df_infec_local['Incidence'].copy() - if npi_activation_delay > 0: - # shift values to npi_activation_delay days later - local_incid.iloc[npi_activation_delay: - ] = local_incid.iloc[0:-npi_activation_delay].values - # take constant value of day 0 for first delay days - local_incid.iloc[:npi_activation_delay] = local_incid.iloc[0] - # compare incidence against threshold - int_active = ( - local_incid >= incidvalthrsh).astype(int) - # multiply rows of data frame by either 1 if threshold - # passed (i.e., mentioned NPI is active) or zero - # (i.e., mentioned NPI is not active) - # 'mul' multiplies the original data frame row by row - # with the respective value in int_active - df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ - = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) - - # reduction of factor space NPI x incidence threshold to NPI - # by max aggregation of all incidence threshold columns per NPI - if fine_resolution == 1: - for main_code, codes_group in maincode_to_npicodes_map.items(): - # group by incidence (former codes X1_Y, X1_Z were transformed - # to X1, X2) and write max value to main code column - df_local_new.loc[:, main_code] = df_local_new.loc[:, codes_group].max( - axis=1) - # remove subcategory columns - df_local_new = df_local_new.loc[:, [ - dd.EngEng['date'], dd.EngEng['idCounty']] + npi_codes_aggregated].copy() - - counters[cid] += time.perf_counter()-start_time - cid += 1 - ### ### - - start_time = time.perf_counter() - df_npis = df_npis.append(df_local_new, - ignore_index=True).copy() - counters[cid] += time.perf_counter()-start_time - cid += 1 - - # divide working time by completed number of counties and multiply - # by remaining number of counties to estimate time remaining - time_remain = sum( - counters) / countyidx * (len(unique_geo_entities) - countyidx) - # print progress - if countyidx == 1 or countyidx % int( - len(unique_geo_entities) / 10) == 0: - print('Progress ' + str(countyidx) + ' / ' + - str(len(unique_geo_entities)) + - '. 
Estimated time remaining: ' + - str(int(time_remain / 60)) + ' min.') - - # print sub counters - print('Sub task counters are: ') - print(counters) - - # reset index and drop old index column - df_npis.reset_index(inplace=True) - try: - df_npis = df_npis.drop(columns='index') - except: - pass - try: - df_npis = df_npis.drop(columns='level_0') - except: - pass - - #### start validation #### - if fine_resolution == 2: - start_date_validation = datetime(2020, 3, 1) - end_date_validation = datetime(2022, 2, 15) - - for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: - for npiCode in [ - 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', - 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: - for subcode in [''] + ['_'+str(i) for i in range(1, 6)]: - [ - a, b, oldf, newf] = validate( - df_npis_old, df_npis, df_infec_rki, countyID, - npiCode + subcode, start_npi_cols, npi_incid_start, - start_date_validation, end_date_validation, - fine_resolution) - if(a != b): - print('Error in NPI activation computation') - else: - print(a, b, a == b) - - elif fine_resolution == 1: - start_date_validation = datetime(2020, 3, 1) - end_date_validation = datetime(2022, 2, 15) - - for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: - for npiCode in [ - 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', - 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: - [a, b, oldf, newf] = validate(df_npis_old, df_npis, - df_infec_rki, countyID, npiCode, start_npi_cols, - npi_incid_start, start_date_validation, - end_date_validation, fine_resolution) + #### start validation #### + if fine_resolution == 2: + start_date_validation = datetime(2020, 3, 1) + end_date_validation = datetime(2022, 2, 15) + + for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + for npiCode in [ + 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', + 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: + for subcode in [''] + ['_'+str(i) for i in range(1, 6)]: + [ + a, b, oldf, newf] = validate( + df_npis_old, df_npis, df_infec_rki, countyID, + npiCode + subcode, start_npi_cols, npi_incid_start, + start_date_validation, end_date_validation, + fine_resolution) if(a != b): print('Error in NPI activation computation') else: print(a, b, a == b) - #### end validation #### + + elif fine_resolution == 1: + start_date_validation = datetime(2020, 3, 1) + end_date_validation = datetime(2022, 2, 15) + + for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + for npiCode in [ + 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', + 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: + [a, b, oldf, newf] = validate(df_npis_old, df_npis, + df_infec_rki, countyID, npiCode, start_npi_cols, + npi_incid_start, start_date_validation, + end_date_validation, fine_resolution) + if(a != b): + print('Error in NPI activation computation') + else: + print(a, b, a == b) + #### end validation #### + + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'germany_counties_npi_subcat_incgrouped' + else: + filename = 'germany_counties_npi_subcat' + else: + filename = 'germany_counties_npi_maincat' + gd.write_dataframe(df_npis, directory, filename, file_format) + + # stupid validation + # df_validation = pd.read_json(directory + filename + ".json") + # if len( + # np.where( + # df_validation.iloc[:, start_npi_cols - 1:] != df_npis.iloc + # [:, start_npi_cols - 1:])[0]) > 0: + # print('Error in file writing/reading') + + +def analyze_npi_data( + read_data=dd.defaultDict['read_data'], + make_plot=dd.defaultDict['make_plot']): + + if not read_data: + x = 15 + # 
transform_npi_data(fine_resolution=2, + # file_format=dd.defaultDict['file_format'], + # out_folder=dd.defaultDict['out_folder'], + # start_date=dd.defaultDict['start_date'], + # end_date=dd.defaultDict['end_date'], + # make_plot=dd.defaultDict['make_plot'], + # ) + + else: # read formatted file if fine_resolution > 0: if fine_resolution == 1: @@ -823,22 +836,13 @@ def transform_npi_data(fine_resolution=2, filename = 'germany_counties_npi_subcat' else: filename = 'germany_counties_npi_maincat' - gd.write_dataframe(df_npis, directory, filename, file_format) - - # stupid validation - # df_validation = pd.read_json(directory + filename + ".json") - # if len( - # np.where( - # df_validation.iloc[:, start_npi_cols - 1:] != df_npis.iloc - # [:, start_npi_cols - 1:])[0]) > 0: - # print('Error in file writing/reading') - - # get code levels (main/subcodes) and position of main codes - # code_level = [i.count('_') for i in npi_codes] - # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] - - # check if any other integer than 0: not implemented or 1: implemented is - # used (maybe to specify the kind of implementation) + df_npis = pd.read_json(directory + filename + ".json") + # get code levels (main/subcodes) and position of main codes + # code_level = [i.count('_') for i in npi_codes] + # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] + + # check if any other integer than 0: not implemented or 1: implemented is + # used (maybe to specify the kind of implementation) if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0: print("Info: Please ensure that NPI information is only boolean.") @@ -1099,7 +1103,7 @@ def main(): """! Main program entry.""" # arg_dict = gd.cli("testing") - transform_npi_data(fine_resolution=2, read_data=False, make_plot=True) + transform_npi_data(fine_resolution=0) if __name__ == "__main__": From 57d4c698a9928a24081eade5185d855f8a209fcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 9 Jun 2022 17:17:24 +0200 Subject: [PATCH 015/104] Start of NPI combination matrix use; WIP --- .../memilio/epidata/transformNPIData.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 7ef3139c5a..5c3c036f07 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -510,6 +510,48 @@ def transform_npi_data(fine_resolution=2, [i, dd.EngEng['npiCode']]] incidence_thresholds_to_npis[incval].append(i) + # for fine_resolution == 2 deactivation of non-combinable + # incidence-dependent NPIs has to be conducted; therefore we defined a + # matrix of possible combinations of NPIs (marked with an X if combinable) + # NPIs of different main category (e.g., M01a and M04) can always be + # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each + # other + if fine_resolution == 2: + df_npis_combinations_pre = pd.read_excel( + os.path.join( + directory, 'combination_npis.xlsx')) + + # rename essential columns and throw away others + column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] + rename_columns = {column_names[i]: i for i in range(len(column_names))} + df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) + df_npis_combinations_pre = df_npis_combinations_pre[[ + 'Variablenname'] + [i for i in range(0, 16)]] + + # extract different NPI groups and 
store indices of NPIs belonging + # to the different groups + npi_groups_combinations = pd.Series( + code.split('_')[0] + for code in df_npis_combinations_pre['Variablenname']) + npi_groups_combinations_unique = npi_groups_combinations.unique() + npi_groups_idx = [] + for code in npi_groups_combinations_unique: + npi_groups_idx.append( + list( + npi_groups_combinations + [npi_groups_combinations == code].index)) + # create hash table of main code to combination matrix + df_npis_combinations = { + npi_groups_combinations_unique[i]: np.zeros( + (len(npi_groups_idx[i]), + len(npi_groups_idx[i]))) + for i in range(len(npi_groups_combinations_unique))} + + # run through all groups and set possible combinations according to + # read combination matrix + for i in range(len(npi_groups_idx)): + npi_groups_idx[i] # TODO + # get county ids unique_geo_entities = geoger.get_county_ids() # check if more than the county of Eisenach would be removed with From b6b3379f30a51ed62710c2529385aaff2ae0e355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 10 Jun 2022 17:17:40 +0200 Subject: [PATCH 016/104] Read combination matrix and export to clean format --- .../memilio/epidata/transformNPIData.py | 113 +++++++++++------- 1 file changed, 70 insertions(+), 43 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 5c3c036f07..ca939f64bf 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -356,6 +356,75 @@ def transform_npi_data(fine_resolution=2, npi_codes_prior = df_npis_desc['Variablenname'] npi_codes_prior_desc = df_npis_desc['Variable'] + # for fine_resolution == 2 deactivation of non-combinable + # incidence-dependent NPIs has to be conducted; therefore we defined a + # matrix of possible combinations of NPIs (marked with an X if combinable) + # NPIs of different main category (e.g., M01a and M04) can always be + # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each + # other + if fine_resolution == 2: + df_npis_combinations_pre = pd.read_excel( + os.path.join( + directory, 'combination_npis.xlsx')) + + # rename essential columns and throw away others + column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] + rename_columns = {column_names[i]: i for i in range(len(column_names))} + df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) + df_npis_combinations_pre = df_npis_combinations_pre[[ + 'Variablenname'] + [i for i in range(0, 16)]] + # replace empty cells by zeros and x-marked cells by ones + df_npis_combinations_pre = df_npis_combinations_pre.replace(np.nan, 0) + df_npis_combinations_pre = df_npis_combinations_pre.replace('x', 1) + + # extract different NPI groups and store indices of NPIs belonging + # to the different groups + npi_groups_combinations = pd.Series( + code.split('_')[0] + for code in df_npis_combinations_pre['Variablenname']) + npi_groups_combinations_unique = npi_groups_combinations.unique() + npi_groups_idx = [] + for code in npi_groups_combinations_unique: + npi_groups_idx.append( + list( + npi_groups_combinations + [npi_groups_combinations == code].index)) + # create hash table of main code to combination matrix + df_npis_combinations = { + npi_groups_combinations_unique[i]: np.eye(len(npi_groups_idx[i])) + for i in range(len(npi_groups_combinations_unique))} + + # run through all groups and set possible combinations 
according to + # read combination matrix + start_comb_matrix = list( + df_npis_combinations_pre.columns).index('Variablenname')+1 + for i in range(len(npi_groups_idx)): + df_npis_combinations[npi_groups_combinations_unique[i]] = df_npis_combinations_pre.iloc[npi_groups_idx[i], + start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values + if (df_npis_combinations[npi_groups_combinations_unique[i]]-np.transpose(df_npis_combinations[npi_groups_combinations_unique[i]])).max() > 0: + print('Error in input file: Please correct combination matrix input.') + + writer = pd.ExcelWriter(os.path.join( + directory, 'combinations_npis_cleanoutput.xlsx')) + # use to_excel function and specify the sheet_name and index + # to store the dataframe in specified sheet + for i in range(len(npi_groups_combinations_unique)): + codes_out = df_npis_combinations_pre.loc[npi_groups_idx[i], + 'Variablenname'].values + df_out = pd.DataFrame( + df_npis_combinations + [npi_groups_combinations_unique[i]], + columns=codes_out) + df_out.insert(0, 'Code', codes_out) + df_out.insert( + 0, 'Description (German)', + [desc + for desc in npi_codes_prior_desc + [npi_codes_prior.isin(codes_out)].values]) + df_out.to_excel( + writer, sheet_name=npi_groups_combinations_unique[i]) + writer.save() + # correct differences in codes between data sheet and explanation sheet codes_dropped = [] # no dropping for fine_resolution == 0 if fine_resolution > 0: @@ -510,48 +579,6 @@ def transform_npi_data(fine_resolution=2, [i, dd.EngEng['npiCode']]] incidence_thresholds_to_npis[incval].append(i) - # for fine_resolution == 2 deactivation of non-combinable - # incidence-dependent NPIs has to be conducted; therefore we defined a - # matrix of possible combinations of NPIs (marked with an X if combinable) - # NPIs of different main category (e.g., M01a and M04) can always be - # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each - # other - if fine_resolution == 2: - df_npis_combinations_pre = pd.read_excel( - os.path.join( - directory, 'combination_npis.xlsx')) - - # rename essential columns and throw away others - column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] - rename_columns = {column_names[i]: i for i in range(len(column_names))} - df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) - df_npis_combinations_pre = df_npis_combinations_pre[[ - 'Variablenname'] + [i for i in range(0, 16)]] - - # extract different NPI groups and store indices of NPIs belonging - # to the different groups - npi_groups_combinations = pd.Series( - code.split('_')[0] - for code in df_npis_combinations_pre['Variablenname']) - npi_groups_combinations_unique = npi_groups_combinations.unique() - npi_groups_idx = [] - for code in npi_groups_combinations_unique: - npi_groups_idx.append( - list( - npi_groups_combinations - [npi_groups_combinations == code].index)) - # create hash table of main code to combination matrix - df_npis_combinations = { - npi_groups_combinations_unique[i]: np.zeros( - (len(npi_groups_idx[i]), - len(npi_groups_idx[i]))) - for i in range(len(npi_groups_combinations_unique))} - - # run through all groups and set possible combinations according to - # read combination matrix - for i in range(len(npi_groups_idx)): - npi_groups_idx[i] # TODO - # get county ids unique_geo_entities = geoger.get_county_ids() # check if more than the county of Eisenach would be removed with @@ -1145,7 +1172,7 @@ def main(): """! 
Main program entry."""

    # arg_dict = gd.cli("testing")
-    transform_npi_data(fine_resolution=0)
+    transform_npi_data(fine_resolution=2)


 if __name__ == "__main__":

From 4d1e4f7f567d089e92d35128edbf571dff10588d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Fri, 10 Jun 2022 17:58:02 +0200
Subject: [PATCH 017/104] Make combination matrix dataframe

---
 .../memilio/epidata/transformNPIData.py | 37 ++++++++++++-------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index ca939f64bf..e96260d608 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -389,9 +389,14 @@ def transform_npi_data(fine_resolution=2,
             list(
                 npi_groups_combinations
                 [npi_groups_combinations == code].index))
-        # create hash table of main code to combination matrix
+        # create hash table of main code to contained codes and combination matrix
         df_npis_combinations = {
-            npi_groups_combinations_unique[i]: np.eye(len(npi_groups_idx[i]))
+            npi_groups_combinations_unique[i]:
+            [
+                list(
+                    df_npis_combinations_pre['Variablenname']
+                    [npi_groups_idx[i]]),
+                np.eye(len(npi_groups_idx[i]))]
             for i in range(len(npi_groups_combinations_unique))}

     # run through all groups and set possible combinations according to
@@ -399,28 +404,34 @@ def transform_npi_data(fine_resolution=2,
         start_comb_matrix = list(
             df_npis_combinations_pre.columns).index('Variablenname')+1
         for i in range(len(npi_groups_idx)):
-            df_npis_combinations[npi_groups_combinations_unique[i]] = df_npis_combinations_pre.iloc[npi_groups_idx[i],
-                                                                                                    start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values
-            if (df_npis_combinations[npi_groups_combinations_unique[i]]-np.transpose(df_npis_combinations[npi_groups_combinations_unique[i]])).max() > 0:
+            codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i],
+                                                       'Variablenname'].values
+            df_npis_combinations[npi_groups_combinations_unique[i]][1] = df_npis_combinations_pre.iloc[npi_groups_idx[i],
+                                                                                                       start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values
+            if (df_npis_combinations[npi_groups_combinations_unique[i]][1]-np.transpose(df_npis_combinations[npi_groups_combinations_unique[i]][1])).max() > 0:
                 print('Error in input file: Please correct combination matrix input.')
+            # make it a dataframe to allow easy removal of code lines and rows
+            # if they are not used later on
+            df_npis_combinations[npi_groups_combinations_unique[i]][1] = pd.DataFrame(
+                df_npis_combinations[npi_groups_combinations_unique[i]][1],
+                columns=codes_local)
+            df_npis_combinations[npi_groups_combinations_unique[i]][1].insert(
+                0, 'Code', codes_local)

         writer = pd.ExcelWriter(os.path.join(
             directory, 'combinations_npis_cleanoutput.xlsx'))
         # use to_excel function and specify the sheet_name and index
         # to store the dataframe in specified sheet
         for i in range(len(npi_groups_combinations_unique)):
-            codes_out = df_npis_combinations_pre.loc[npi_groups_idx[i],
-                                                     'Variablenname'].values
-            df_out = pd.DataFrame(
-                df_npis_combinations
-                [npi_groups_combinations_unique[i]],
-                columns=codes_out)
-            df_out.insert(0, 'Code', codes_out)
+            codes_local = df_npis_combinations[npi_groups_combinations_unique[i]
+                                               ][1].columns[1:]
+            df_out = df_npis_combinations[npi_groups_combinations_unique[i]][
+                1].copy()
             df_out.insert(
                 0, 'Description (German)',
                 [desc
                 for desc in npi_codes_prior_desc
-                [npi_codes_prior.isin(codes_out)].values])
+                [npi_codes_prior.isin(codes_local)].values])
             df_out.to_excel(
                 writer, sheet_name=npi_groups_combinations_unique[i])
         writer.save()

From 4c0c8ab88830bf517f64e3588ae0429fa5459a24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Fri, 24 Jun 2022 17:49:14 +0200
Subject: [PATCH 018/104] todo combination matrix

---
 .../memilio/epidata/transformNPIData.py | 41 +++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index e96260d608..ace16489cc 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -418,10 +418,16 @@ def transform_npi_data(fine_resolution=2,
             df_npis_combinations[npi_groups_combinations_unique[i]][1].insert(
                 0, 'Code', codes_local)

-        writer = pd.ExcelWriter(os.path.join(
-            directory, 'combinations_npis_cleanoutput.xlsx'))
         # use to_excel function and specify the sheet_name and index
-        # to store the dataframe in specified sheet
+        # to store the dataframe in specified sheet if file not yet existent
+        # otherwise just validate results against stored sheets
+        write_file = False
+        if not os.path.exists(os.path.join(
+                directory,
+                'combinations_npis_cleanoutput.xlsx')):
+            writer = pd.ExcelWriter(os.path.join(
+                directory, 'combinations_npis_cleanoutput.xlsx'))
+            write_file = True
         for i in range(len(npi_groups_combinations_unique)):
             codes_local = df_npis_combinations[npi_groups_combinations_unique[i]
                                                ][1].columns[1:]
@@ -432,9 +438,22 @@ def transform_npi_data(fine_resolution=2,
                 [desc
                 for desc in npi_codes_prior_desc
                 [npi_codes_prior.isin(codes_local)].values])
-            df_out.to_excel(
-                writer, sheet_name=npi_groups_combinations_unique[i])
-        writer.save()
+            try:
+                # validate against previously stored output
+                df_in_valid = pd.read_excel(
+                    os.path.join(
+                        directory, 'combinations_npis_cleanoutput.xlsx'),
+                    sheet_name=i)
+                if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out):
+                    print('Error in combination matrix.')
+            except:
+                pass
+
+            if write_file:
+                df_out.to_excel(
+                    writer, sheet_name=npi_groups_combinations_unique[i])
+        if write_file:
+            writer.save()

     # correct differences in codes between data sheet and explanation sheet
     codes_dropped = []  # no dropping for fine_resolution == 0
@@ -534,6 +553,8 @@ def transform_npi_data(fine_resolution=2,
         npis = pd.DataFrame(npis_dummy)
         del npi_codes
         del npi_desc

+    # TODO: df_npis_combinations['M01a'][1][ isin (npis.NPI_code)]
+
     # prepare grouping of NPIs to reduce product space of
     # NPI x active_from_inc (with values "incidence does not matter", and
     # incidence 0, 10, 35, 50, 100) to NPI
@@ -601,7 +622,7 @@ def transform_npi_data(fine_resolution=2,
             npi_incid_start[npis.loc[i, dd.EngEng['npiCode']]
                             ] = incid_threshold

-        # get all incidence thresholds
+        # get all incidence thresholds (This list has to be sorted)
         incidence_thresholds = sorted(set(npi_incid_start.values()))

         # create hash map from thresholds to NPI indices
         incidence_thresholds_to_npis = dict(
             zip(incidence_thresholds,
                 [[] for i in range(len(incidence_thresholds))]))
         for i in range(len(npis)):
             incval = npi_incid_start[npis.loc
                                      [i, dd.EngEng['npiCode']]]
             incidence_thresholds_to_npis[incval].append(i)
@@ -787,6 +808,12 @@ def transform_npi_data(fine_resolution=2,
                     df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \
                         = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0)

+                    # TODO
+                    # if new, dynamic NPIs for higher incidence cannot be
+                    # combined with older, dynamic NPIs for lower indices,
+                    # the latter have to be deactivated
+                    # (incidence_thresholds_to_npis.keys() has to be sorted !)
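# --- Editor's sketch, not part of the patch: a minimal, runnable
# illustration of the threshold-to-NPI hash map built in the hunk above.
# The codes and thresholds below are invented for demonstration only. ---
npi_incid_start_demo = {'M1_1': -1, 'M1_1_1': 0, 'M1_1_2': 10, 'M1_1_3': 35}
incidence_thresholds_demo = sorted(set(npi_incid_start_demo.values()))
# map each threshold to the row indices of all NPIs activated by it
thresholds_to_npis_demo = dict(
    zip(incidence_thresholds_demo,
        [[] for _ in incidence_thresholds_demo]))
for idx, code in enumerate(npi_incid_start_demo.keys()):
    thresholds_to_npis_demo[npi_incid_start_demo[code]].append(idx)
print(thresholds_to_npis_demo)  # {-1: [0], 0: [1], 10: [2], 35: [3]}
# Iterating the keys in this sorted order is what the deactivation of
# less strict NPIs announced in the TODO above relies on.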
+ # reduction of factor space NPI x incidence threshold to NPI # by max aggregation of all incidence threshold columns per NPI if fine_resolution == 1: From 5664fc7ebdb7ce87acc51a8c0f227a73147cbaa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 24 Jun 2022 21:12:14 +0200 Subject: [PATCH 019/104] todo combination matrix --- .../memilio/epidata/transformNPIData.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index ace16489cc..91788c9e3e 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -552,8 +552,16 @@ def transform_npi_data(fine_resolution=2, npis = pd.DataFrame(npis_dummy) del npi_codes del npi_desc + # remove rows and columns of unused codes + for code in df_npis_combinations.keys(): + local_codes_used_rows = df_npis_combinations[code][1].Code.isin( + npis.NPI_code) + local_codes_used_cols = df_npis_combinations[code][1].columns.isin( + npis.NPI_code) - # TODO: df_npis_combinations['M01a'][1][ isin (npis.NPI_code)] + # overwrite item 0 since codes are stored in *.columns + df_npis_combinations[code] = df_npis_combinations[code][1].loc[local_codes_used_rows, + local_codes_used_cols].reset_index(drop=True).copy() # prepare grouping of NPIs to reduce product space of # NPI x active_from_inc (with values "incidence does not matter", and @@ -808,11 +816,13 @@ def transform_npi_data(fine_resolution=2, df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) - # TODO # if new, dynamic NPIs for higher incidence cannot be # combined with older, dynamic NPIs for lower indices, # the latter have to be deactivated # (incidence_thresholds_to_npis.keys() has to be sorted !) 
+ # TODO + for code in df_npis_combinations.keys(): + df_npis_combinations[code] # reduction of factor space NPI x incidence threshold to NPI # by max aggregation of all incidence threshold columns per NPI From 4a12894c28bdbb01a27c3111291336ee7f1d18b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 11 Jul 2022 21:31:48 +0200 Subject: [PATCH 020/104] incidence_thresholds_to_npis now also contains subcode end, not only threshold --- .../memilio/epidata/transformNPIData.py | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 91788c9e3e..9893d4741d 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -609,15 +609,30 @@ def transform_npi_data(fine_resolution=2, ] = incid_threshold # get all incidence thresholds (This list has to be sorted) - incidence_thresholds = sorted(set(npi_incid_start.values())) + incidence_thresholds = [] + for code, threshold in npi_incid_start.items(): + if len(code.split('_')) < 3: + if not (threshold, '') in incidence_thresholds: + incidence_thresholds.append((threshold, '')) + else: + if not (threshold, '_' + code.split('_')[2]) in incidence_thresholds: + incidence_thresholds.append( + (threshold, '_' + code.split('_')[2])) + for i in range(len(incidence_thresholds)-1): + if incidence_thresholds[i][0] > incidence_thresholds[i+1][0]: + sys.exit('List needs to be sorted.') # create hash map from thresholds to NPI indices incidence_thresholds_to_npis = dict( zip(incidence_thresholds, [[] for i in range(len(incidence_thresholds))])) for i in range(len(npis)): - incval = npi_incid_start[npis.loc - [i, dd.EngEng['npiCode']]] - incidence_thresholds_to_npis[incval].append(i) + code_considered = npis.loc[i, dd.EngEng['npiCode']] + incval = npi_incid_start[code_considered] + if len(code_considered.split('_')) < 3: + incidence_thresholds_to_npis[(incval, '')].append(i) + else: + incidence_thresholds_to_npis[( + incval, '_' + code_considered.split('_')[2])].append(i) # get county ids unique_geo_entities = geoger.get_county_ids() @@ -797,7 +812,7 @@ def transform_npi_data(fine_resolution=2, # iterate through all NPIs and activate if incidence threshold # is exceeded for incidvalthrsh, npi_indices in incidence_thresholds_to_npis.items(): - if incidvalthrsh >= 0: + if incidvalthrsh[0] >= 0: local_incid = df_infec_local['Incidence'].copy() if npi_activation_delay > 0: # shift values to npi_activation_delay days later @@ -807,7 +822,7 @@ def transform_npi_data(fine_resolution=2, local_incid.iloc[:npi_activation_delay] = local_incid.iloc[0] # compare incidence against threshold int_active = ( - local_incid >= incidvalthrsh).astype(int) + local_incid >= incidvalthrsh[0]).astype(int) # multiply rows of data frame by either 1 if threshold # passed (i.e., mentioned NPI is active) or zero # (i.e., mentioned NPI is not active) @@ -821,8 +836,17 @@ def transform_npi_data(fine_resolution=2, # the latter have to be deactivated # (incidence_thresholds_to_npis.keys() has to be sorted !) 
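# --- Editor's sketch, not part of the patch: after this patch, the keys of
# incidence_thresholds_to_npis are (threshold, subcode suffix) tuples.
# The codes below are invented for demonstration only. ---
for code, threshold in [('M1_010', -1), ('M1_010_1', 0), ('M1_010_2', 10)]:
    parts = code.split('_')
    key = (threshold, '') if len(parts) < 3 else (threshold, '_' + parts[2])
    print(code, '->', key)
# M1_010   -> (-1, '')
# M1_010_1 -> (0, '_1')
# M1_010_2 -> (10, '_2')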
# TODO - for code in df_npis_combinations.keys(): - df_npis_combinations[code] + columns_treated = list( + df_local_new.iloc + [:, npis_idx_start + np.array(npi_indices)].columns) + # check if subcodes that are incidence dependent were + # treated (they have two underscores in their codes) + if all([len(col.split('_')) == 3 for col in columns_treated]): + columns_treated_main = pd.Series( + [code_val[0:-2] for code_val in columns_treated]) + for code in df_npis_combinations.keys(): + npi_combo_submatrix_indices_changed = np.where( + columns_treated_main.isin(df_npis_combinations[code].columns) == True)[0] # reduction of factor space NPI x incidence threshold to NPI # by max aggregation of all incidence threshold columns per NPI From 85a29db64f6779bf77f4bb9c641c627cd490efca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Tue, 12 Jul 2022 22:22:27 +0200 Subject: [PATCH 021/104] exclusion loops now started --- .../memilio/epidata/transformNPIData.py | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 9893d4741d..c55dd7cdd5 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -831,11 +831,32 @@ def transform_npi_data(fine_resolution=2, df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) - # if new, dynamic NPIs for higher incidence cannot be - # combined with older, dynamic NPIs for lower indices, - # the latter have to be deactivated - # (incidence_thresholds_to_npis.keys() has to be sorted !) - # TODO + # if new, dynamic NPIs for higher incidence cannot be + # combined with older, dynamic NPIs for lower indices, + # the latter have to be deactivated + # (incidence_thresholds_to_npis.keys() has to be sorted !) 
+ # TODO + for incidvalthrsh, npi_indices in incidence_thresholds_to_npis.items(): + if incidvalthrsh[0] >= 0: + for code in df_npis_combinations.keys(): + code_cols = df_npis_combinations[code].columns + # iterate over subcode indices + for scidx in range(len(code_cols)-1): + subcodes_nocombi = df_npis_combinations[code].loc[scidx, + code_cols[scidx+1:]] + # only consider those codes which cannot be + # combined; for these values of 1 have to be + # set to 0 + subcodes_nocombi = list( + subcodes_nocombi[subcodes_nocombi == 0].index) + # iterate over exclusive subcodes + for subcode_excl in subcodes_nocombi: + # iterate over less strict dynamic NPIs + # i.e., where threshold is higher + for level in incidence_thresholds_to_npis.keys(): + if level[0] > incidvalthrsh[0]: + x = 15 # TODO + columns_treated = list( df_local_new.iloc [:, npis_idx_start + np.array(npi_indices)].columns) From 868d68d05ac2cb9ebb60d432047b3b933cacfbd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 14 Jul 2022 17:47:14 +0200 Subject: [PATCH 022/104] if clause for check --- .../memilio/epidata/transformNPIData.py | 68 ++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index c55dd7cdd5..1dc13370cf 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -811,8 +811,8 @@ def transform_npi_data(fine_resolution=2, # iterate through all NPIs and activate if incidence threshold # is exceeded - for incidvalthrsh, npi_indices in incidence_thresholds_to_npis.items(): - if incidvalthrsh[0] >= 0: + for level, npi_indices in incidence_thresholds_to_npis.items(): + if level[0] >= 0: # level[0] = incidvalthrsh local_incid = df_infec_local['Incidence'].copy() if npi_activation_delay > 0: # shift values to npi_activation_delay days later @@ -822,7 +822,7 @@ def transform_npi_data(fine_resolution=2, local_incid.iloc[:npi_activation_delay] = local_incid.iloc[0] # compare incidence against threshold int_active = ( - local_incid >= incidvalthrsh[0]).astype(int) + local_incid >= level[0]).astype(int) # level[0] = incidvalthrsh # multiply rows of data frame by either 1 if threshold # passed (i.e., mentioned NPI is active) or zero # (i.e., mentioned NPI is not active) @@ -836,38 +836,44 @@ def transform_npi_data(fine_resolution=2, # the latter have to be deactivated # (incidence_thresholds_to_npis.keys() has to be sorted !) 
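# --- Editor's sketch, not part of the patch: extracting the non-combinable
# partners of one subcode from a toy combination matrix row, as the loops
# above do via the upper triangle. The codes are invented for demonstration
# only; zeros mark pairs that must not be active together. ---
import pandas as pd

row = pd.Series([1, 0, 1, 0], index=['M1_1', 'M1_2', 'M1_3', 'M1_4'])
subcodes_nocombi_demo = list(row[row == 0].index)
print(subcodes_nocombi_demo)  # ['M1_2', 'M1_4'] are excluded partners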
+                        # TODO
-                        for incidvalthrsh, npi_indices in incidence_thresholds_to_npis.items():
-                            if incidvalthrsh[0] >= 0:
+                        for level in incidence_thresholds_to_npis.keys():
+                            if level[0] >= 0:
                                 for code in df_npis_combinations.keys():
                                     code_cols = df_npis_combinations[code].columns
                                     # iterate over subcode indices
                                     for scidx in range(len(code_cols)-1):
-                                        subcodes_nocombi = df_npis_combinations[code].loc[scidx,
-                                                                                          code_cols[scidx+1:]]
-                                        # only consider those codes which cannot be
-                                        # combined; for these values of 1 have to be
-                                        # set to 0
-                                        subcodes_nocombi = list(
-                                            subcodes_nocombi[subcodes_nocombi == 0].index)
-                                        # iterate over exclusive subcodes
-                                        for subcode_excl in subcodes_nocombi:
-                                            # iterate over less strict dynamic NPIs
-                                            # i.e., where threshold is higher
-                                            for level in incidence_thresholds_to_npis.keys():
-                                                if level[0] > incidvalthrsh[0]:
-                                                    x = 15  # TODO
+                                        # check if code was used, otherwise nothing to
+                                        # exclude, i.e. no combination possible anyway.
+                                        if df_local_new.loc[:,code_cols[scidx]+level[1]].any():
+                                            # extract codes whose interference was not
+                                            # considered before. So only consider upper
+                                            # triangle (except main diagonal).
+                                            subcodes_nocombi = df_npis_combinations[code].loc[scidx,
+                                                                                              code_cols[scidx+1:]]
+                                            # only consider those codes which cannot be
+                                            # combined; for these values of 1 have to be
+                                            # set to 0
+                                            subcodes_nocombi = list(
+                                                subcodes_nocombi[subcodes_nocombi == 0].index)
+                                            # iterate over exclusive subcodes
+                                            for subcode_excl in subcodes_nocombi:
+                                                # iterate over less strict dynamic NPIs
+                                                # i.e., where threshold is higher
+                                                for level_other in incidence_thresholds_to_npis.keys():
+                                                    if level_other[0] > level[0]:
+                                                        x = 15  # TODO

-                        columns_treated = list(
-                            df_local_new.iloc
-                            [:, npis_idx_start + np.array(npi_indices)].columns)
-                        # check if subcodes that are incidence dependent were
-                        # treated (they have two underscores in their codes)
-                        if all([len(col.split('_')) == 3 for col in columns_treated]):
-                            columns_treated_main = pd.Series(
-                                [code_val[0:-2] for code_val in columns_treated])
-                            for code in df_npis_combinations.keys():
-                                npi_combo_submatrix_indices_changed = np.where(
-                                    columns_treated_main.isin(df_npis_combinations[code].columns) == True)[0]
+                        # columns_treated = list(
+                        #     df_local_new.iloc
+                        #     [:, npis_idx_start + np.array(npi_indices)].columns)
+                        # # check if subcodes that are incidence dependent were
+                        # # treated (they have two underscores in their codes)
+                        # if all([len(col.split('_')) == 3 for col in columns_treated]):
+                        #     columns_treated_main = pd.Series(
+                        #         [code_val[0:-2] for code_val in columns_treated])
+                        #     for code in df_npis_combinations.keys():
+                        #         npi_combo_submatrix_indices_changed = np.where(
+                        #             columns_treated_main.isin(df_npis_combinations[code].columns) == True)[0]

                     # reduction of factor space NPI x incidence threshold to NPI
                     # by max aggregation of all incidence threshold columns per NPI

From 85a29db64f6779bf77f4bb9c641c627cd490efca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Sun, 14 Aug 2022 15:04:46 +0200
Subject: [PATCH 023/104] Deactivation of exclusive NPIs

---
 .../memilio/epidata/transformNPIData.py | 96 ++++++++++---------
 1 file changed, 50 insertions(+), 46 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index 1dc13370cf..de947ce1a3 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -812,7 +812,7 @@ def
transform_npi_data(fine_resolution=2, # iterate through all NPIs and activate if incidence threshold # is exceeded for level, npi_indices in incidence_thresholds_to_npis.items(): - if level[0] >= 0: # level[0] = incidvalthrsh + if level[0] >= 0: # level[0] = incidvalthrsh local_incid = df_infec_local['Incidence'].copy() if npi_activation_delay > 0: # shift values to npi_activation_delay days later @@ -821,8 +821,8 @@ def transform_npi_data(fine_resolution=2, # take constant value of day 0 for first delay days local_incid.iloc[:npi_activation_delay] = local_incid.iloc[0] # compare incidence against threshold - int_active = ( - local_incid >= level[0]).astype(int) # level[0] = incidvalthrsh + int_active = (local_incid >= level[0]).astype( + int) # level[0] = incidvalthrsh # multiply rows of data frame by either 1 if threshold # passed (i.e., mentioned NPI is active) or zero # (i.e., mentioned NPI is not active) @@ -831,49 +831,53 @@ def transform_npi_data(fine_resolution=2, df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) - # if new, dynamic NPIs for higher incidence cannot be - # combined with older, dynamic NPIs for lower indices, - # the latter have to be deactivated - # (incidence_thresholds_to_npis.keys() has to be sorted !) - # TODO - for level in incidence_thresholds_to_npis.keys(): - if level[0] >= 0: - for code in df_npis_combinations.keys(): - code_cols = df_npis_combinations[code].columns - # iterate over subcode indices - for scidx in range(len(code_cols)-1): - # check if code was used, otherwise nothing to - # exclude, i.e. no combination possible anyway. - if df_local_new.loc[:,code_cols[scidx]+level[1]].any(): - # extract codes that whose interference was not - # considered before. So only consider upper - # triangle (except main diagonal). - subcodes_nocombi = df_npis_combinations[code].loc[scidx, - code_cols[scidx+1:]] - # only consider those codes which cannot be - # combined; for these values of 1 have to be - # set to 0 - subcodes_nocombi = list( - subcodes_nocombi[subcodes_nocombi == 0].index) - # iterate over exclusive subcodes - for subcode_excl in subcodes_nocombi: - # iterate over less strict dynamic NPIs - # i.e., where threshold is higher - for level_other in incidence_thresholds_to_npis.keys(): - if level_other[0] > level[0]: - x = 15 # TODO - - # columns_treated = list( - # df_local_new.iloc - # [:, npis_idx_start + np.array(npi_indices)].columns) - # # check if subcodes that are incidence dependent were - # # treated (they have two underscores in their codes) - # if all([len(col.split('_')) == 3 for col in columns_treated]): - # columns_treated_main = pd.Series( - # [code_val[0:-2] for code_val in columns_treated]) - # for code in df_npis_combinations.keys(): - # npi_combo_submatrix_indices_changed = np.where( - # columns_treated_main.isin(df_npis_combinations[code].columns) == True)[0] + # if new, dynamic NPIs for higher incidence (more restrictions, + # i.e., stricter) cannot be combined with previous, dynamic + # NPIs for lower indices (less restrictions, less strict), + # the latter have to be deactivated + # (incidence_thresholds_to_npis.keys() has to be sorted !) 
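# --- Editor's sketch, not part of the patch: the deactivation step that the
# following hunk implements, run on toy data. On days where a stricter,
# higher-threshold NPI is active, the excluded, less strict NPI is reset to
# zero. Column names are invented for demonstration only. ---
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({'M1_1_3': [0, 1, 1, 0],   # strict, higher threshold
                        'M1_2_1': [1, 1, 1, 1]})  # exclusive, lower threshold
active_idx = np.where(df_demo['M1_1_3'] > 0)[0]
df_demo.loc[active_idx, 'M1_2_1'] = 0
print(df_demo['M1_2_1'].tolist())  # [1, 0, 0, 1]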
+ # TODO + levels_exclusion = list(reversed(incidence_thresholds_to_npis.keys()))[ + 0:-1] # level<0 means non-incidence dependent and always active + for level in levels_exclusion: + level_lower = [lev for lev in levels_exclusion + if lev[0] < level[0]] + for code in df_npis_combinations.keys(): + code_cols = df_npis_combinations[code].columns + # iterate over subcode indices + for scidx in range(len(code_cols)-1): + # check if code was used, otherwise nothing to + # exclude, i.e. no combination possible anyway. + indicator_code_active = df_local_new.loc[:, + code_cols + [scidx] + + level + [1]] + indicator_code_active_idx = np.where( + indicator_code_active > 0)[0] + if len(indicator_code_active_idx) > 0: + # extract codes + subcodes_nocombi = df_npis_combinations[code].loc[scidx, :] + # only consider those codes which cannot be + # combined; for these values of 1 have to be + # set to 0 + subcodes_nocombi = list( + subcodes_nocombi + [subcodes_nocombi == 0].index) + # iterate over exclusive subcodes + for subcode_excl in subcodes_nocombi: + # iterate over less strict dynamic NPIs + # i.e., where threshold is higher + for level_other in level_lower: + # deactivate potential NPIs (with code: + # subcode_excl + level_other[1]) on days + # where NPI code_cols[scidx] + level[1] + # is active + if df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]].any(): + print('Resetting potential NPI') + # df_npis_old[df_npis_old.ID_County==5315].iloc[[14,34],indicator_code_active_idx] + df_local_new.loc[indicator_code_active_idx, + subcode_excl + level_other[1]] = 0 # reduction of factor space NPI x incidence threshold to NPI # by max aggregation of all incidence threshold columns per NPI From df1480f0073d14fe8552b1d1b8668f5a8daffdd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Tue, 16 Aug 2022 18:15:38 +0200 Subject: [PATCH 024/104] Correction and extension for delayed NPI implementation or lifting --- .../memilio/epidata/transformNPIData.py | 55 +++++++++++++++---- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index de947ce1a3..c31b641a35 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -297,8 +297,11 @@ def transform_npi_data(fine_resolution=2, # defines delay in number of days between exceeding # incidence threshold and NPI getting active npi_activation_delay = 0 + npi_lifting_delay = 0 print('Using a delay of NPI activation of ' + str(npi_activation_delay) + ' days.') + print('Using a delay of NPI lifting of ' + + str(npi_lifting_delay) + ' days.') try: df_npis_old = pd.read_csv( @@ -712,7 +715,7 @@ def transform_npi_data(fine_resolution=2, counters = np.zeros(5) # time counter for output only countyidx = 0 # unique_geo_entities: - for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + for countyID in [1001]:#[5315, 1001, 9162, 16071, 11000, 1060, 5566]: cid = 0 countyidx += 1 @@ -722,6 +725,7 @@ def transform_npi_data(fine_resolution=2, ) pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] == countyID, dd.EngEng['population']].values[0] + # consider difference between current day and day-7 to compute incidence incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) df_infec_local['Incidence'] = incidence_local / pop_local * 100000 
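# --- Editor's sketch, not part of the patch: the 7-day incidence per
# 100,000 computed in the hunk above, on toy cumulative case counts and a
# made-up population. ---
import pandas as pd

confirmed_demo = pd.Series([0, 10, 20, 30, 40, 50, 60, 80, 110])
pop_demo = 100000
incidence_demo = confirmed_demo.diff(periods=7).fillna(
    confirmed_demo) / pop_demo * 100000
# day 7: 80 - 0 = 80 new cases in the last seven days; day 8: 110 - 10 = 100;
# the first seven days fall back to the cumulative counts via fillna.
print(incidence_demo.tolist())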
@@ -814,15 +818,36 @@ def transform_npi_data(fine_resolution=2,
             # iterate through all NPIs and activate if incidence threshold
             # is exceeded
             for level, npi_indices in incidence_thresholds_to_npis.items():
                 if level[0] >= 0:  # level[0] = incidvalthrsh
                     local_incid = df_infec_local['Incidence'].copy()
+
+                    if npi_lifting_delay > 0:
+                        # take maximum over last 'npi_lifting_delay' days to
+                        # lift NPI only after incidence is below threshold for
+                        # 'npi_lifting_delay' number of days
+                        local_incid_max = local_incid.rolling(npi_lifting_delay).max().fillna(local_incid)
+
+                        # compare transformed incidence against threshold
+                        # (level[0] = incidvalthrsh)
+                        int_active = (local_incid_max >= level[0]).astype(int)
+                    else:
+                        int_active = (local_incid >= level[0]).astype(int)
+
                     if npi_activation_delay > 0:
-                        # shift values to npi_activation_delay days later
-                        local_incid.iloc[npi_activation_delay:
-                                         ] = local_incid.iloc[0:-npi_activation_delay].values
-                        # take constant value of day 0 for first delay days
-                        local_incid.iloc[:npi_activation_delay] = local_incid.iloc[0]
-                    # compare incidence against threshold
-                    int_active = (local_incid >= level[0]).astype(
-                        int)  # level[0] = incidvalthrsh
+                        # take minimum over last 'npi_activation_delay' days to
+                        # enforce NPI only after incidence is over threshold for
+                        # 'npi_activation_delay' number of days
+                        local_incid_min = local_incid.rolling(npi_activation_delay).min().fillna(local_incid)
+
+                        int_potential_begin_list = np.where((local_incid_min >= level[0]).astype(int))[0]
+
+                        # correct start date for first implementation of NPI
+                        int_active[range((int_potential_begin_list[0]-npi_lifting_delay+1),int_potential_begin_list[0])] = 0
+                        # correct start dates for further implementations
+                        # and account for oscillating incidence which does
+                        # not directly yield lifting of the implementation
+                        for i in range(0, len(int_potential_begin_list)-1):
+                            if int_potential_begin_list[i+1] - int_potential_begin_list[i] > npi_lifting_delay:
+                                int_active[range((int_potential_begin_list[i+1]-npi_lifting_delay+1),int_potential_begin_list[i+1])] = 0
+
                     # multiply rows of data frame by either 1 if threshold
                     # passed (i.e., mentioned NPI is active) or zero
@@ -839,6 +864,7 @@ def transform_npi_data(fine_resolution=2,
                     # TODO
                     levels_exclusion = list(reversed(incidence_thresholds_to_npis.keys()))[
                         0:-1]  # level<0 means non-incidence dependent and always active
+                    print('\n')
                     for level in levels_exclusion:
                         level_lower = [lev for lev in levels_exclusion
                                        if lev[0] < level[0]]
@@ -874,10 +900,15 @@ def transform_npi_data(fine_resolution=2,
                                             # where NPI code_cols[scidx] + level[1]
                                             # is active
                                             if df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]].any():
-                                                print('Resetting potential NPI')
+                                                print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0]))
+                                                print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0])
+                                                print('Due to Incidence > ' + str(level[0]) + ' and NPI ')
+                                                print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0])
+                                                print('\n')
                                                 # df_npis_old[df_npis_old.ID_County==5315].iloc[[14,34],indicator_code_active_idx]
-                                                df_local_new.loc[indicator_code_active_idx,
-                                                                 subcode_excl + level_other[1]] = 0
+                                                df_local_new.loc[indicator_code_active_idx,
+                                                                 subcode_excl + level_other[1]] = 0
+                                                x=15

                     # reduction of factor space NPI x incidence threshold to NPI
                     # by max aggregation of all incidence threshold columns per NPI

From 977df04842d76dfcddf0f15a7d69f4abc509c16d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Thu, 18 Aug 2022 18:53:24 +0200
Subject: [PATCH 025/104] Further correction of NPI start and end date for
 delays with extensive example

---
 .../memilio/epidata/transformNPIData.py | 125 ++++++++++++------
 1 file changed, 87 insertions(+), 38 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index c31b641a35..a620e1cf5a 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -182,6 +182,10 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights):
 def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode,
              start_npi_cols, npi_incid_start, start_date_validation,
              end_date_validation, fine_resolution):
+    """! Validates the transformed NPI data based on the read-in NPI data list.
+    Also works for incidence-dependent NPIs as long as no activation or lifting
+    delay is used.
+    """

     if fine_resolution == 1:
         npiCodes = [npiCode + code
@@ -296,8 +300,8 @@ def transform_npi_data(fine_resolution=2,
     if fine_resolution > 0:
         # defines delay in number of days between exceeding
         # incidence threshold and NPI getting active
-        npi_activation_delay = 0
-        npi_lifting_delay = 0
+        npi_activation_delay = 1
+        npi_lifting_delay = 3
         print('Using a delay of NPI activation of ' +
               str(npi_activation_delay) + ' days.')
         print('Using a delay of NPI lifting of ' +
               str(npi_lifting_delay) + ' days.')
@@ -715,7 +719,7 @@ def transform_npi_data(fine_resolution=2,
     counters = np.zeros(5)  # time counter for output only
     countyidx = 0
     # unique_geo_entities:
-    for countyID in [1001]:#[5315, 1001, 9162, 16071, 11000, 1060, 5566]:
+    for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]:
         cid = 0
         countyidx += 1
@@ -819,34 +823,81 @@ def transform_npi_data(fine_resolution=2,
                 if level[0] >= 0:  # level[0] = incidvalthrsh
                     local_incid = df_infec_local['Incidence'].copy()

-                    if npi_lifting_delay > 0:
-                        # take maximum over last 'npi_lifting_delay' days to
-                        # lift NPI only after incidence is below threshold for
-                        # 'npi_lifting_delay' number of days
-                        local_incid_max = local_incid.rolling(npi_lifting_delay).max().fillna(local_incid)
-
-                        # compare transformed incidence against threshold
-                        # (level[0] = incidvalthrsh)
-                        int_active = (local_incid_max >= level[0]).astype(int)
-                    else:
-                        int_active = (local_incid >= level[0]).astype(int)
-
-                    if npi_activation_delay > 0:
-                        # take minimum over last 'npi_activation_delay' days to
-                        # enforce NPI only after incidence is over threshold for
-                        # 'npi_activation_delay' number of days
-                        local_incid_min = local_incid.rolling(npi_activation_delay).min().fillna(local_incid)
-
-                        int_potential_begin_list = np.where((local_incid_min >= level[0]).astype(int))[0]
-
-                        # correct start date for first implementation of NPI
-                        int_active[range((int_potential_begin_list[0]-npi_lifting_delay+1),int_potential_begin_list[0])] = 0
-                        # correct start dates for further implementations
-                        # and account for oscillating incidence which does
-                        # not directly yield lifting of the implementation
-                        for i in range(0, len(int_potential_begin_list)-1):
-                            if int_potential_begin_list[i+1] - int_potential_begin_list[i] > npi_lifting_delay:
-                                int_active[range((int_potential_begin_list[i+1]-npi_lifting_delay+1),int_potential_begin_list[i+1])] = 0
+                    # If npi_lifting_delay=0, then local_incid equals
+                    # local_incid.rolling(npi_lifting_delay+1)
+                    # take maximum over last 'npi_lifting_delay+1' days to
+                    # lift NPI only after incidence is below threshold for
+                    # 'npi_lifting_delay' number of days
+                    # Example why we use npi_lifting_delay+1:
+                    # Incidences [4, 2, 2, 2], npi_lifting_delay = 2,
+                    # Consideration: threshold: 3
+                    # pd.Series([4, 2, 2, 2]).rolling(2).max().fillna(...)
+                    # = [4, 4, 2, 2]
+                    # would then lift the NPI on day 3 although the incidence is
+                    # below the threshold for two full days only ON day 3, so the NPI
+                    # should be lifted on day 4.
+                    # Therefore, use 'npi_lifting_delay+1', to lift AFTER
+                    # npi_lifting_delay many days and not on this day
+                    local_incid_max = local_incid.rolling(npi_lifting_delay+1).max().fillna(local_incid)
+
+                    # compare transformed incidence against threshold
+                    # (level[0] = incidvalthrsh)
+                    int_active = (local_incid_max >= level[0]).astype(int)
+
+                    # take minimum over last 'npi_activation_delay' days to
+                    # enforce NPI only after incidence is over threshold for
+                    # 'npi_activation_delay' number of days
+                    local_incid_min = local_incid.rolling(npi_activation_delay+1).min().fillna(local_incid)
+
+                    # Correct start dates of NPIs
+                    # Example:
+                    # inc=pd.Series([4,2,4,2,2,4,4,2,4,2,2,2,2])
+                    # Threshold (i.e., level[0]): 3, NPI activation delay: 1,
+                    # NPI lifting delay: 3.
+                    # NPI should then start on 7th day and be lifted on 12th day
+                    #
+                    # We have
+                    # [1,0,1,1,1,1,1,1,1,1,1,1,0] for inc.rolling(3+1).max() >= 3
+                    # which is 'int_active' and indicates correct end date but
+                    # wrong start date.
+                    # I.e., NPI is lifted after three days below threshold 3 but
+                    # would have started on day 1 where the threshold 3 is only
+                    # exceeded for the first time and not yet for one full day.
+                    # To be more precise, activation_delay=N means that the NPI
+                    # starts AFTER N days where incidence is exceeded and not
+                    # ON the Nth day.
+                    # We also have
+                    # [1,0,0,0,0,0,1,0,0,0,0,0,0] for inc.rolling(1+1).min() >= 3
+                    # and
+                    # int_potential_begin_list = [0,6]
+                    # We then set the empty set of int_active[0:0] to zero
+                    # In the for loop:
+                    # Then, since 6 - 0 > npi_lifting_delay + 1 = 4
+                    # int_active[2,3,4,5] = 0
+                    #
+                    # Note that in case of the example
+                    # inc=pd.Series([4,4,4,2,2,4,4,2,4,2,2,2,2])
+                    # int_potential_begin_list = [0,1,2,6]
+                    # and NPI would correctly start on day 2 and would not have
+                    # been deactivated/lifted while incidence goes below 3 for
+                    # days 4 and 5.
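# --- Editor's sketch, not part of the patch: the activation/lifting
# mechanics described in the example above, run end to end with threshold 3,
# activation delay 1 and lifting delay 3. ---
import numpy as np
import pandas as pd

inc = pd.Series([4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
thr, act_delay, lift_delay = 3, 1, 3
int_active = (inc.rolling(lift_delay + 1).max().fillna(inc) >= thr).astype(int)
begins = np.where(
    (inc.rolling(act_delay + 1).min().fillna(inc) >= thr).astype(int))[0]
# correct the start of the first and of all later implementations
int_active[range(max(begins[0] - lift_delay - 1, 0), begins[0])] = 0
for i in range(0, len(begins) - 1):
    if begins[i + 1] - begins[i] > lift_delay + 1:
        int_active[range(begins[i + 1] - lift_delay - 1, begins[i + 1])] = 0
print(int_active.tolist())  # [1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0]
# Apart from the edge effect on day 0 (see Note 2 below), the NPI is active
# on days 6-11 (0-indexed) and lifted on day 12, as the example states.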
+                    #
+                    # Note 2: There may be some negligible unwanted effects in
+                    # activation/deactivation for the first days of the full
+                    # time series since we cannot compute min(), max()
+                    # over the last days there (see also use of fillna(...))
+                    int_potential_begin_list = np.where(
+                        (local_incid_min >= level[0]).astype(int))[0]
+                    # Correct start date for first implementation of NPI
+                    # (npi_lifting_delay is needed here instead of
+                    # npi_activation_delay since int_active is computed via
+                    # maximum over npi_lifting_delay+1 many days)
+                    int_active[range(max(
+                        (int_potential_begin_list[0]-npi_lifting_delay-1), 0), int_potential_begin_list[0])] = 0
+                    # Correct start dates for further implementations
+                    # and account for oscillating incidence which does
+                    # not directly yield lifting of the implementation
+                    for i in range(0, len(int_potential_begin_list)-1):
+                        if int_potential_begin_list[i+1] - int_potential_begin_list[i] > npi_lifting_delay + 1:
+                            int_active[range(
+                                (int_potential_begin_list[i+1]-npi_lifting_delay-1), int_potential_begin_list[i+1])] = 0

                     # multiply rows of data frame by either 1 if threshold
                     # passed (i.e., mentioned NPI is active) or zero
@@ -861,7 +912,6 @@ def transform_npi_data(fine_resolution=2,
                     # NPIs for lower indices (less restrictions, less strict),
                     # the latter have to be deactivated
                     # (incidence_thresholds_to_npis.keys() has to be sorted !)
-                    # TODO
                     levels_exclusion = list(reversed(incidence_thresholds_to_npis.keys()))[
                         0:-1]  # level<0 means non-incidence dependent and always active
                     print('\n')
                     for level in levels_exclusion:
@@ -900,15 +950,14 @@ def transform_npi_data(fine_resolution=2,
                                             # where NPI code_cols[scidx] + level[1]
                                             # is active
                                             if df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]].any():
-                                                print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0]))
-                                                print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0])
-                                                print('Due to Incidence > ' + str(level[0]) + ' and NPI ')
-                                                print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0])
-                                                print('\n')
+                                                # print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0]))
+                                                # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0])
+                                                # print('Due to Incidence > ' + str(level[0]) + ' and NPI ')
+                                                # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0])
+                                                # print('\n')
                                                 # df_npis_old[df_npis_old.ID_County==5315].iloc[[14,34],indicator_code_active_idx]
                                                 df_local_new.loc[indicator_code_active_idx,
                                                                  subcode_excl + level_other[1]] = 0
-                                                x=15

                     # reduction of factor space NPI x incidence threshold to NPI
                     # by max aggregation of all incidence threshold columns per NPI

From da29e98fe7b317be14710a4c2bd05534cbe24efa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Thu, 18 Aug 2022 18:55:28 +0200
Subject: [PATCH 026/104] correct formatting; no other change

---
 .../memilio/epidata/transformNPIData.py | 61 ++++++++++---------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index a620e1cf5a..1985b55480 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -305,7 +305,7 @@ def transform_npi_data(fine_resolution=2,
     print('Using a delay of NPI activation of ' +
print('Using a delay of NPI activation of ' + str(npi_activation_delay) + ' days.') print('Using a delay of NPI lifting of ' + - str(npi_lifting_delay) + ' days.') + str(npi_lifting_delay) + ' days.') try: df_npis_old = pd.read_csv( @@ -325,7 +325,7 @@ def transform_npi_data(fine_resolution=2, 'M24_030', 'M24_040', 'M24_050', 'M24_060'] for tcode in test_codes: for i in [''] + ["_" + str(i) for i in range(1, 6)]: - if(df_npis_old[df_npis_old[dd.EngEng['npiCode']] == tcode+i].iloc[:, 6:].max().max() > 0): + if (df_npis_old[df_npis_old[dd.EngEng['npiCode']] == tcode+i].iloc[:, 6:].max().max() > 0): print(tcode+i + " used.") # end check @@ -823,22 +823,23 @@ def transform_npi_data(fine_resolution=2, if level[0] >= 0: # level[0] = incidvalthrsh local_incid = df_infec_local['Incidence'].copy() - # If npi_lifting_delay=0, then local_incid equals + # If npi_lifting_delay=0, then local_incid equals # local_incid.rolling(npi_lifting_delay+1) # take maximum over last 'npi_lifting_delay+1' days to # lift NPI only after incidence is below threshold for - # 'npi_lifting_delay' number of days - # Example why we use npi_lifting_delay+1: + # 'npi_lifting_delay' number of days + # Example why we use npi_lifting_delay+1: # Incidences [4, 2, 2, 2], npi_lifting_delay = 2, # Consideration: threshold: 3 # pd.Series([4, 2, 2, 2]).rolling(2).max().nan(...) # = [4, 4, 2, 2, 2] - # would then lift NPIs on day 3 while incidence is + # would then lift NPIs on day 3 while incidence is # below 4 for two days only ON day 3, so NPI would - # be lifted on day 4. + # be lifted on day 4. # Therefore, use 'npi_lifting_delay+1', to lift AFTER # npi_lifting_delay many days and not on this day - local_incid_max = local_incid.rolling(npi_lifting_delay+1).max().fillna(local_incid) + local_incid_max = local_incid.rolling( + npi_lifting_delay+1).max().fillna(local_incid) # compare transformed incidence against threshold # (level[0] = incidvalthrsh) @@ -847,57 +848,61 @@ def transform_npi_data(fine_resolution=2, # take minimum over last 'npi_activation_delay' days to # enforce NPI only after incidence is over threshold for # 'npi_activation_delay' number of days - local_incid_min = local_incid.rolling(npi_activation_delay+1).min().fillna(local_incid) + local_incid_min = local_incid.rolling( + npi_activation_delay+1).min().fillna(local_incid) - # Correct start dates of NPIs + # Correct start dates of NPIs # Example: # inc=pd.Series([4,2,4,2,2,4,4,2,4,2,2,2,2]) - # Threshold (i.e., level[0]): 3, NPI activation delay: 1, + # Threshold (i.e., level[0]): 3, NPI activation delay: 1, # NPI lifting delay: 3. # NPI should then start on 7th day and be lifted on 12th day # # We have # [1,0,1,1,1,1,1,1,1,1,1,1,0] for inc.rolling(3+1).max() >= 3 - # which is 'int_active' and indicates correct end date but + # which is 'int_active' and indicates correct end date but # wrong start date. # I.e., NPI is lifted after three days below treshold 3 but - # would have started on day 1 where the threshold 3 is only + # would have started on day 1 where the threshold 3 is only # exceeded for the first time and not yet for one full day. # To be more precise, activation_delay=N means that the NPI - # starts AFTER N days where incidence is exceeded and not + # starts AFTER N days where incidence is exceeded and not # ON the Nth day. 
                     # We also have
                     # [1,0,0,0,0,0,1,0,0,0,0,0,0] for inc.rolling(1+1).min() >= 3
                     # and
                     # int_potential_begin_list = [0,6]
                     # We then set the empty set of int_active[0:0] to zero
-                    # In the for loop:
+                    # In the for loop:
                     # Then, since 6 - 0 > npi_lifting_delay + 1 = 4
-                    # int_active[2,3,4,5] = 0
+                    # int_active[2,3,4,5] = 0
                     #
-                    # Note that in case of the example
+                    # Note that in case of the example
                     # inc=pd.Series([4,4,4,2,2,4,4,2,4,2,2,2,2])
                     # int_potential_begin_list = [0,1,2,6]
                     # and NPI would correctly start on day 2 and would not have
                     # been deactivated/lifted while incidence goes below 3 for
                     # days 4 and 5.
                     #
-                    # Note 2: There may be some negligible unwanted effects in
+                    # Note 2: There may be some negligible unwanted effects in
                     # activation/deactivation for the first days of the full
-                    # time series since we cannot compute min(), max()
+                    # time series since we cannot compute min(), max()
                     # over the last days there (see also use of fillna(...))
-                    int_potential_begin_list = np.where(
-                        (local_incid_min >= level[0]).astype(int))[0]
+                    int_potential_begin_list = np.where(
+                        (local_incid_min >= level[0]).astype(int))[0]
                     # Correct start date for first implementation of NPI
-                    # (npi_lifting_delay is needed here instead of
-                    # npi_activation_delay since int_active is computed via
+                    # (npi_lifting_delay is needed here instead of
+                    # npi_activation_delay since int_active is computed via
                     # maximum over npi_lifting_delay+1 many days)
-                    int_active[range(max(
-                        (int_potential_begin_list[0]-npi_lifting_delay-1), 0), int_potential_begin_list[0])] = 0
+                    int_active[range(max(
+                        (int_potential_begin_list[0]-npi_lifting_delay-1), 0), int_potential_begin_list[0])] = 0
                     # Correct start dates for further implementations
                     # and account for oscillating incidence which does
                     # not directly yield lifting of the implementation
                     for i in range(0, len(int_potential_begin_list)-1):
                         if int_potential_begin_list[i+1] - int_potential_begin_list[i] > npi_lifting_delay + 1:
-                            int_active[range(
-                                (int_potential_begin_list[i+1]-npi_lifting_delay-1), int_potential_begin_list[i+1])] = 0
+                            int_active[range(
+                                (int_potential_begin_list[i+1]-npi_lifting_delay-1), int_potential_begin_list[i+1])] = 0
@@ -953,11 +958,11 @@ def transform_npi_data(fine_resolution=2,
                                                 # print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0]))
                                                 # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0])
                                                 # print('Due to Incidence > ' + str(level[0]) + ' and NPI ')
-                                                # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0])
+                                                # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0])
                                                 # print(list(df_local_new.loc[indicator_code_active_idx,'Date']))
                                                 # print('\n')
                                                 # df_npis_old[df_npis_old.ID_County==5315].iloc[[14,34],indicator_code_active_idx]
                                                 df_local_new.loc[indicator_code_active_idx,
-                                                                 subcode_excl + level_other[1]] = 0
+                                                                 subcode_excl + level_other[1]] = 0

                     # reduction of factor space NPI x incidence threshold to NPI
                     # by max aggregation of all incidence threshold columns per NPI
@@ -1024,7 +1029,7 @@ def transform_npi_data(fine_resolution=2,
                             npiCode + subcode, start_npi_cols, npi_incid_start,
                             start_date_validation, end_date_validation,
                             fine_resolution)
-                    if(a != b):
+                    if (a != b):
                         print('Error in NPI activation computation')
                     else:
                         print(a, b, a == b)
@@ -1041,7 +1046,7 @@ def transform_npi_data(fine_resolution=2,
                [a, b, oldf, newf] = validate(df_npis_old, df_npis,
df_infec_rki, countyID, npiCode, start_npi_cols, npi_incid_start, start_date_validation, end_date_validation, fine_resolution) - if(a != b): + if (a != b): print('Error in NPI activation computation') else: print(a, b, a == b) From 7f11dbfc8afa569c561a7a8338eabdc388cde81f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 18 Aug 2022 19:27:59 +0200 Subject: [PATCH 027/104] remove printouts and replace deprecated pandas append --- .../memilio/epidata/transformNPIData.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 1985b55480..da4b7e187b 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -919,7 +919,6 @@ def transform_npi_data(fine_resolution=2, # (incidence_thresholds_to_npis.keys() has to be sorted !) levels_exclusion = list(reversed(incidence_thresholds_to_npis.keys()))[ 0:-1] # level<0 means non-incidence dependent and always active - print('\n') for level in levels_exclusion: level_lower = [lev for lev in levels_exclusion if lev[0] < level[0]] @@ -959,8 +958,8 @@ def transform_npi_data(fine_resolution=2, # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0]) # print('Due to Incidence > ' + str(level[0]) + ' and NPI ') # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0]) + # print(list(df_local_new.loc[indicator_code_active_idx,'Date'])) # print('\n') - # df_npis_old[df_npis_old.ID_County==5315].iloc[[14,34],indicator_code_active_idx] df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]] = 0 @@ -981,8 +980,9 @@ def transform_npi_data(fine_resolution=2, ### ### start_time = time.perf_counter() - df_npis = df_npis.append(df_local_new, - ignore_index=True).copy() + + df_npis = pd.concat([df_npis, df_local_new], ignore_index=True) + counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1014,7 +1014,7 @@ def transform_npi_data(fine_resolution=2, pass #### start validation #### - if fine_resolution == 2: + if fine_resolution == 2 and (npi_activation_delay + npi_lifting_delay == 0): start_date_validation = datetime(2020, 3, 1) end_date_validation = datetime(2022, 2, 15) @@ -1031,8 +1031,7 @@ def transform_npi_data(fine_resolution=2, fine_resolution) if (a != b): print('Error in NPI activation computation') - else: - print(a, b, a == b) + print(a, b, a - b) elif fine_resolution == 1: start_date_validation = datetime(2020, 3, 1) @@ -1048,7 +1047,6 @@ def transform_npi_data(fine_resolution=2, end_date_validation, fine_resolution) if (a != b): print('Error in NPI activation computation') - else: print(a, b, a == b) #### end validation #### From 671e9a9585b8b396f5477c52db76ed46e79528ab Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 17 Oct 2022 13:29:52 +0200 Subject: [PATCH 028/104] fix errors --- .../memilio/epidata/transformNPIData.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index da4b7e187b..e5d694f961 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -17,17 +17,15 @@ # See the License for the specific 
language governing permissions and # limitations under the License. ############################################################################# -from datetime import datetime, timedelta, date +from datetime import datetime, timedelta import sys import time import os import pandas as pd import numpy as np import matplotlib.pyplot as plt -from matplotlib import colors from scipy.spatial.distance import pdist from scipy.cluster import hierarchy -from sklearn.cluster import KMeans from memilio.epidata import getDataIntoPandasDataFrame as gd from memilio.epidata import geoModificationGermany as geoger @@ -347,7 +345,7 @@ def transform_npi_data(fine_resolution=2, df_npis_desc = pd.read_excel( os.path.join( directory, 'datensatzbeschreibung_massnahmen.xlsx'), - sheet_name=2) + sheet_name=2, engine = 'openpyxl') else: df_npis_desc = pd.read_excel( os.path.join( @@ -372,7 +370,7 @@ def transform_npi_data(fine_resolution=2, if fine_resolution == 2: df_npis_combinations_pre = pd.read_excel( os.path.join( - directory, 'combination_npis.xlsx')) + directory, 'combination_npis.xlsx'), engine = 'openpyxl') # rename essential columns and throw away others column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] @@ -450,7 +448,7 @@ def transform_npi_data(fine_resolution=2, df_in_valid = pd.read_excel( os.path.join( directory, 'combinations_npis_cleanoutput.xlsx'), - sheet_name=i) + sheet_name=i, engine = 'openpyxl') if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out): print('Error in combination matrix.') except: @@ -601,7 +599,7 @@ def transform_npi_data(fine_resolution=2, if fine_resolution > 0: npi_incid_start = dict() for i in range(len(npis)): - incid_threshold = 1e10 + incid_threshold = 1e5 if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': # set -1 for incidence-independent NPIs incid_threshold = -1 @@ -648,7 +646,7 @@ def transform_npi_data(fine_resolution=2, counties_removed = df_npis_old[ ~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][ dd.EngEng['idCounty']].unique() - if len(counties_removed) == 1 and counties_removed[0] != 16056: + if list(counties_removed) != [16056]: sys.exit('Error. 
Other counties than that of Eisenach were removed.') # remove rows for Eisenach df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( @@ -684,7 +682,7 @@ def transform_npi_data(fine_resolution=2, # NPIs were active if fine_resolution > 0: df_infec_rki = pd.read_json(os.path.join( - directory, 'all_county_all_dates_repdate_rki.json')) + directory, 'cases_all_county_all_dates_repdate.json')) df_infec_rki[dd.EngEng['date']] = pd.to_datetime( df_infec_rki[dd.EngEng['date']]) df_population = pd.read_json( @@ -731,8 +729,8 @@ def transform_npi_data(fine_resolution=2, == countyID, dd.EngEng['population']].values[0] # consider difference between current day and day-7 to compute incidence incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( - periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) - df_infec_local['Incidence'] = incidence_local / pop_local * 100000 + periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) / pop_local * 100000 + df_infec_local['Incidence'] = incidence_local # set to main data frame df_infec_rki.loc[df_infec_rki[dd.EngEng['idCounty']] == @@ -752,8 +750,7 @@ def transform_npi_data(fine_resolution=2, for i in df_local_old[dd.EngEng['npiCode']]] # get list of NPI codes, ordered as the rows in the current data frame - npi_codes_ordered_as_rows = df_local_old[dd.EngEng['npiCode']][ - npi_rows].to_list() + npi_codes_ordered_as_rows = npis_final['NPI_code'].to_list() # get indices of rows for the NPI codes as in the sorted npi_codes list # may be superfluous if NPI code rows are sorted correctly @@ -1066,11 +1063,11 @@ def transform_npi_data(fine_resolution=2, # df_validation.iloc[:, start_npi_cols - 1:] != df_npis.iloc # [:, start_npi_cols - 1:])[0]) > 0: # print('Error in file writing/reading') + npi_codes_considered = [] #which codes? 
+    analyze_npi_data(True, True, fine_resolution, npis, directory, file_format, npi_codes_considered)


-def analyze_npi_data(
-        read_data=dd.defaultDict['read_data'],
-        make_plot=dd.defaultDict['make_plot']):
+def analyze_npi_data(read_data, make_plot, fine_resolution, npis, directory, file_format, npi_codes_considered):

     if not read_data:
         x = 15

From 65596e75cef9e4513198f5c10dfc0795bee7bc58 Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Fri, 4 Nov 2022 10:15:18 +0100
Subject: [PATCH 029/104] rework activation/lifting of incidence dependent
 NPIs

---
 .../memilio/epidata/transformNPIData.py       | 142 ++++++++----------
 1 file changed, 59 insertions(+), 83 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index e5d694f961..2e55f54fc4 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -298,8 +298,11 @@ def transform_npi_data(fine_resolution=2,
     if fine_resolution > 0:
         # defines delay in number of days between exceeding
         # incidence threshold and NPI getting active
-        npi_activation_delay = 1
-        npi_lifting_delay = 3
+        # delay = 0 means only one day is considered (=no delay)
+        npi_activation_delay = 0
+        npi_lifting_delay = 4  # for NRW, BW
+        # 2 for Bayern
+        # we use npi_lifting_delay = 4 as this is the most common value
         print('Using a delay of NPI activation of ' +
               str(npi_activation_delay) + ' days.')
         print('Using a delay of NPI lifting of ' +
@@ -737,8 +740,10 @@ def transform_npi_data(fine_resolution=2,
                 countyID, 'Incidence'] = df_infec_local['Incidence'].values

             # cut infection information at start_date_new and end_date_new
-            df_infec_local = df_infec_local.loc[(df_infec_local[dd.EngEng['date']] >= start_date_new) & (
-                df_infec_local[dd.EngEng['date']] <= end_date_new), :].reset_index()
+            df_infec_local = df_infec_local[df_infec_local
+                                            [dd.EngEng['date']] >=
+                                            start_date_new][
+                df_infec_local[dd.EngEng['date']] <= end_date_new].reset_index()

             # get county-local data frame
             start_time = time.perf_counter()
@@ -819,87 +824,58 @@ def transform_npi_data(fine_resolution=2,
             for level, npi_indices in incidence_thresholds_to_npis.items():
                 if level[0] >= 0:  # level[0] = incidvalthrsh
                     local_incid = df_infec_local['Incidence'].copy()
-
-                    # If npi_lifting_delay=0, then local_incid equals
-                    # local_incid.rolling(npi_lifting_delay+1)
-                    # take maximum over last 'npi_lifting_delay+1' days to
-                    # lift NPI only after incidence is below threshold for
-                    # 'npi_lifting_delay' number of days
-                    # Example why we use npi_lifting_delay+1:
-                    # Incidences [4, 2, 2, 2], npi_lifting_delay = 2,
-                    # Consideration: threshold: 3
-                    # pd.Series([4, 2, 2, 2]).rolling(2).max().fillna(...)
-                    # = [4, 4, 2, 2]
-                    # would then lift NPIs on day 3 while incidence is
-                    # below the threshold for two days only ON day 3, so NPI
-                    # would be lifted on day 4.
-                    # Therefore, use 'npi_lifting_delay+1', to lift AFTER
-                    # npi_lifting_delay many days and not on this day
-                    local_incid_max = local_incid.rolling(
-                        npi_lifting_delay+1).max().fillna(local_incid)
-
-                    # compare transformed incidence against threshold
-                    # (level[0] = incidvalthrsh)
-                    int_active = (local_incid_max >= level[0]).astype(int)
-
-                    # take minimum over last 'npi_activation_delay' days to
-                    # enforce NPI only after incidence is over threshold for
-                    # 'npi_activation_delay' number of days
-                    local_incid_min = local_incid.rolling(
-                        npi_activation_delay+1).min().fillna(local_incid)
-
-                    # Correct start dates of NPIs
+                    # NPI can only be activated or lifted the day AFTER
+                    # incidence is below/over threshold for N days. The
+                    # incidence on day N only affects the NPI on day N+1 and
+                    # NOT ON day N. Therefore we shift the incidence one day forward
+                    # to match the indices of our dataframe df_local_new so that
+                    # the NPIs can be calculated on the respective day.
+                    #
+                    # Example (threshold=3.5):
+                    # local_incid=pd.Series([4,2,4,2,2,4,4,2,4,2,2,2,2])
+                    # Yesterday's incidence is over the threshold on the following days:
+                    # [?,1,0,1,0,0,1,1,0,1,0,0,0]
+                    # The first day is not known and always set to the first day's value.
+                    # [1,1,0,1,0,0,1,1,0,1,0,0,0]
+
+                    # First get a Series with 0 where yesterday's incidence
+                    # is below threshold and 1 where it is over the threshold
+                    yesterdays_incid_over_threshold = (local_incid.shift(
+                        1).fillna(local_incid[0]) > level[0]).astype(int)
+
+                    # If incidence is above threshold for
+                    # 1+npi_activation_delay days, the NPI gets activated.
+                    # Similarly, if incidence is below threshold for
+                    # 1+npi_lifting_delay days, the NPI is lifted.
+                    #
                     # Example:
-                    # inc=pd.Series([4,2,4,2,2,4,4,2,4,2,2,2,2])
-                    # Threshold (i.e., level[0]): 3, NPI activation delay: 1,
-                    # NPI lifting delay: 3.
-                    # NPI should then start on 7th day and be lifted on 12th day
-                    #
-                    # We have
-                    # [1,0,1,1,1,1,1,1,1,1,1,1,0] for inc.rolling(3+1).max() >= 3
-                    # which is 'int_active' and indicates correct end date but
-                    # wrong start date.
-                    # I.e., NPI is lifted after three days below threshold 3 but
-                    # would have started on day 1 where the threshold 3 is only
-                    # exceeded for the first time and not yet for one full day.
-                    # To be more precise, activation_delay=N means that the NPI
-                    # starts AFTER N days where incidence is exceeded and not
-                    # ON the Nth day.
-                    # We also have
-                    # [1,0,0,0,0,0,1,0,0,0,0,0,0] for inc.rolling(1+1).min() >= 3
-                    # and
-                    # int_potential_begin_list = [0,6]
-                    # We then set the empty set of int_active[0:0] to zero
-                    # In the for loop:
-                    # Then, since 6 - 0 > npi_lifting_delay + 1 = 4
-                    # int_active[2,3,4,5] = 0
-                    #
-                    # Note that in case of the example
-                    # inc=pd.Series([4,4,4,2,2,4,4,2,4,2,2,2,2])
-                    # int_potential_begin_list = [0,1,2,6]
-                    # and NPI would correctly start on day 2 and would not have
-                    # been deactivated/lifted while incidence goes below 3 for
-                    # days 4 and 5.
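
For reference, the reworked scheme introduced by this patch can be reproduced standalone. A minimal sketch using the example values from the comments (threshold 3.5, activation delay 1, lifting delay 2); the max(0, ...) guards are added here only to keep the first windows in range:

    import numpy as np
    import pandas as pd

    local_incid = pd.Series([4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
    threshold, npi_activation_delay, npi_lifting_delay = 3.5, 1, 2

    # 1 where yesterday's incidence exceeded the threshold, 0 otherwise
    yesterdays_over = (local_incid.shift(1).fillna(
        local_incid[0]) > threshold).astype(int)

    int_active = pd.Series(np.zeros(len(local_incid), dtype=int))
    for i in range(len(yesterdays_over)):
        # lift if the last npi_lifting_delay+1 days were all below threshold
        if yesterdays_over[max(0, i - npi_lifting_delay):i + 1].values.sum() == 0:
            int_active[i] = 0
        # activate if the last npi_activation_delay+1 days were all above
        elif (yesterdays_over[max(0, i - npi_activation_delay):i + 1]
              .values.sum() == npi_activation_delay + 1):
            int_active[i] = 1
        # otherwise keep the previous day's state
        elif i > 0:
            int_active[i] = int_active[i - 1]

    print(list(int_active))  # [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]

This matches the second example in the comments: activation on day 2, lifting on day 13.
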
+                    # With yesterday's incidence over threshold on days:
+                    # [0,1,0,1,0,0,1,1,0,1,0,0,0]
+                    # npi_lifting_delay=2, npi_activation_delay=1
+                    # NPI should be activated on day 8 and lifted on day 13
+                    # int_active should then be:
+                    # [0,0,0,0,0,0,0,1,1,1,1,1,0]
+                    #
+                    # With yesterday's incidence over threshold on days:
+                    # [1,1,0,1,0,0,1,1,0,1,0,0,0] (as above)
+                    # NPI should be activated on day 2 and lifted on day 13
+                    # int_active should then be:
+                    # [0,1,1,1,1,1,1,1,1,1,1,1,0]
+
+                    # get a zero-filled Series of the same length to be
+                    # filled with ones where the NPI is active
+                    int_active = pd.Series(np.zeros(len(local_incid), dtype=int))
+                    # loop over every day
+                    for i in range(len(yesterdays_incid_over_threshold)):
+                        # Set int_active=0 where the last npi_lifting_delay+1 days are all 0
+                        if yesterdays_incid_over_threshold[i-npi_lifting_delay:i+1].values.sum() == 0:
+                            int_active[i] = 0
+                        # Set int_active=1 where the last npi_activation_delay+1 days are all 1
+                        elif yesterdays_incid_over_threshold[i-npi_activation_delay:i+1].values.sum() == npi_activation_delay+1:
+                            int_active[i] = 1
+                        # If no condition applies, set int_active to the value of the previous day
+                        else:
+                            int_active[i] = int_active[i-1]

                     # multiply rows of data frame by either 1 if threshold
                     # passed (i.e., mentioned NPI is active) or zero

From e024321e3481bba2ed3dfd8939b3d5c96bc62946 Mon Sep 17 00:00:00 2001
From: Wendler
Date: Mon, 21 Nov 2022 13:12:50 +0100
Subject: [PATCH 030/104] Add compareNPIData.py

---
 .../memilio/epidata/compareNPIData.py         | 191 ++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 pycode/memilio-epidata/memilio/epidata/compareNPIData.py

diff --git a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py
new file mode 100644
index 0000000000..b7871743a6
--- /dev/null
+++ b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py
@@ -0,0 +1,191 @@
+import os
+import csv
+import pandas as pd
+import numpy as np
+
+directory = '/home/wend_aa/Documents/PSS/NPIs'
+
+#numberofcities = 2
+
+df_npis_old = pd.read_csv(
+    os.path.join(directory, 'old', 'kr_massnahmen_unterkategorien.csv'),
+    sep=',')  # , nrows=numberofcities*1248
+print(df_npis_old)
+numberofcities = int(len(df_npis_old) / 1248)
+print('Number of cities', numberofcities)
+
+list = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09',
+        'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21']
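
The per-subcategory files read below follow a single naming pattern; a small hypothetical helper makes the scheme explicit (the 'new' subdirectory and file pattern are taken from the read_csv calls in this script):

    import os

    def subcategory_path(directory, code):
        # e.g. subcategory_path(directory, 'm01a')
        # -> <directory>/new/kr_massn_unterkat_m01a.csv
        return os.path.join(directory, 'new',
                            'kr_massn_unterkat_{}.csv'.format(code))
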
+#list = ['m01a', 'm01b'] + +# df_npis_new = pd.read_csv( +# os.path.join(directory, 'new', 'kr_massn_unterkat_m01a.csv'), +# sep=',', nrows=4000) + +# print(df_npis_old) + +# print(pd.testing.assert_frame_equal(df_npis_old, df_npis_new)) +# print(pd.testing.assert_frame_equal(df_npis_old, df_npis_new)) + + +# number of days that we have data for in new data set +numberofdays = 883 + +# create data frame that contains structure of old data +df = df_npis_old.iloc[:, :6] + +for i in range(numberofcities): + print('City number', i) + counter = 0 + for code in list: + # print(code) + df_npis_new = pd.read_csv( + os.path.join(directory, 'new', + 'kr_massn_unterkat_{}.csv'.format(code)), + sep=',') # , skiprows=1248 + # print(df_npis_new.iloc[0:numberofdays]) + #df_npis_new = df_npis_new.iloc[1152:] + #print('df_new', df_npis_new) + + # initialization of new data frame: + if (i == 0 and counter == 0): + #print('initialization done') + # extract dates from new data + dates = df_npis_new.iloc[:numberofdays, 5] + # rename dates so that they match dates from old npi dataframe + dates_new = [] + for date in dates: + date = ''.join(('d', date.replace('-', ''))) + dates_new.append(date) + # print(date) + + # extend data frame with columns for each date that is included in new data set + df = pd.concat([df, pd.DataFrame(columns=dates_new)]) + + # get number of codes for current subcategory, we substract 6 because we have 6 columns for bundesland, kreis, etc. + numberofcodes = 0 + numberofcodes += len(df_npis_new.columns) - 6 + #print('numberofcodes', numberofcodes) + + # insert values from new dataset into data frame + df.iloc[i*1248 + counter: i*1248 + counter + numberofcodes, + 6:] = df_npis_new.iloc[0:numberofdays, 6:].T + + #print(df.iloc[i*1248 + counter: i*1248 + counter + numberofcodes]) + + counter += numberofcodes +print(df) + +df_dropped = df +df_npis_old_dropped = df_npis_old +df_lost_subcat = df_npis_old + + +for i in range(numberofcities): + # drop entries for subcategories 22, 23, and 24 because they are not identified for new data set + #print((i+1)*counter, (i+1)*1248) + # df_dropped = df_dropped.drop( + # df_dropped.index[range((i+1)*counter, i*counter + 1248)]) + df_dropped = df_dropped.drop( + df_dropped.index[range((i+1)*counter, i*counter + 1248)]) + #print('df_dropped in loop', df_dropped.iloc[:(i+1)*counter + 3]) + # df_npis_old_dropped = df_npis_old_dropped.drop(df_npis_old_dropped.index[range( + # (i+1)*counter, i*counter + 1248)]) + df_npis_old_dropped = df_npis_old_dropped.drop(df_npis_old_dropped.index[range( + (i+1)*counter, i*counter + 1248)]) + + # extract all dropped rows from df_npis_old + # print('indices', (i)*1248, (i+1)*counter) + df_lost_subcat = df_lost_subcat.drop( + df_lost_subcat.index[range(i*(1248-counter), i*(1248-counter) + counter)]) + # print('df_dropped in loop lost subcat', + # df_lost_subcat.iloc[:(i)*(1248-counter) + 3]) + +#print('df', df.iloc[:1248-counter]) +df_dropped = df_dropped.iloc[:, :723] +print('df_dropped', df_dropped) +print('df_npis_old_dropped', df_npis_old_dropped) + + +#df = df.iloc[:counter, :723] + +#df_npis_old = df_npis_old[:counter] +#print(df_npis_old_dropped.iloc[2*counter:], df_dropped.iloc[2*counter:]) +# print('Differences between old and new data:', pd.testing.assert_frame_equal( +# df_npis_old_dropped.iloc[2*counter:, :], df_dropped.iloc[2*counter:, :], check_dtype=False)) +print('df_npis_old_dropped columns', df_npis_old_dropped.columns) +print('df_dropped columns', df_dropped.columns) + +# Check differences columnwise 
+nodiffcolumns = [] +for i in range(df_npis_old_dropped.shape[1]): + try: + if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[:, i]), pd.DataFrame(df_dropped.iloc[:, i]), check_dtype=False) == None: + nodiffcolumns.append(df_npis_old_dropped.columns[i]) + #print('No difference in column', df_npis_old_dropped.columns[i]) + + except AssertionError as e: + print(e, "\n") + + +# Check differences rowwise +nodiffrows = [] +for i in range(df_npis_old_dropped.shape[0]): + try: + if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[i]), pd.DataFrame(df_dropped.iloc[i]), check_dtype=False) == None: + nodiffrows.append( + df_npis_old_dropped.iloc[i, :6]) + #print('No difference in column', df_npis_old_dropped.columns[i]) + + except AssertionError as e: + print(e, "\n") + +# Check differences citywise +nodiffcities = [] +for i in range(numberofcities): + try: + if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[i*counter: (i+1)*counter, :]), pd.DataFrame(df_dropped.iloc[i*counter: (i+1)*counter, :]), check_dtype=False) == None: + nodiffcities.append(df_npis_old_dropped.iloc[i*counter]['kreis']) + #print('No difference in column', df_npis_old_dropped.columns[i]) + + except AssertionError as e: + print(e, "\n") + + +# check if all values in dropped subcategories are = -99 +print('df_lostsubcat', df_lost_subcat.iloc[:, :]) +print(np.sum(np.where(df_lost_subcat.iloc[:, 6:] != -99))) +columnwisecheck = (df_lost_subcat.iloc[:, 6:] == -99).all() +checkallcolumns = (columnwisecheck == True).all() +print('Dropped subcategories have never been active:', checkallcolumns) + + +# print(nodiffcolumns) +print('Number of no diff columns', len(nodiffcolumns)) +print('Total number of columns', df_npis_old_dropped.shape[1]) + + +#print('df_npis_old_dropped columns', df_npis_old_dropped.columns) +#print('df_dropped columns', df_dropped.columns) +print('Column names are equal', + (df_npis_old_dropped.columns == df_dropped.columns).all()) + + +# print(nodiffrows) +print('Number of no diff rows', len(nodiffrows)) +print('Total number of rows', df_npis_old_dropped.shape[0]) + + +print(nodiffcities) +print('Number of no diff cities', len(nodiffcities)) +print('Total number of cities', numberofcities) + + +# save results in csv file +pd.DataFrame(nodiffcolumns).to_csv(os.path.join( + directory, 'comparedata_columns.csv')) # , nodiffrows, nodiffcities +pd.DataFrame(nodiffrows).to_csv( + os.path.join(directory, 'comparedata_rows.csv')) +pd.DataFrame(nodiffcities).to_csv( + os.path.join(directory, 'comparedata_cities.csv')) From 96d16449edc176d1cb283969799c273d3335eda0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 21 Nov 2022 14:12:46 +0100 Subject: [PATCH 031/104] . 
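
The column-wise comparison idiom this script relies on reduces to the following minimal sketch (toy frames, illustrative names); pd.testing.assert_frame_equal returns None on success and raises AssertionError on any mismatch:

    import pandas as pd

    df_a = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 1, 1]})
    df_b = pd.DataFrame({'x': [1, 2, 4], 'y': [1, 1, 1]})
    matching_columns = []
    for col in df_a.columns:
        try:
            # returns None on success, raises AssertionError otherwise
            if pd.testing.assert_frame_equal(
                    df_a[[col]], df_b[[col]], check_dtype=False) is None:
                matching_columns.append(col)
        except AssertionError as e:
            print(e, '\n')
    print(matching_columns)  # ['y']
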
--- .../memilio/epidata/compareNPIData.py | 112 ++++++++++-------- 1 file changed, 64 insertions(+), 48 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py index b7871743a6..792788b5ee 100644 --- a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py @@ -3,18 +3,19 @@ import pandas as pd import numpy as np -directory = '/home/wend_aa/Documents/PSS/NPIs' +# directory = '/home/wend_aa/Documents/PSS/NPIs' +directory = 'c:\\work\\projets\\epidemiology\\code\\memilio\\data/pydata\\Germany/' #numberofcities = 2 df_npis_old = pd.read_csv( - os.path.join(directory, 'old', 'kr_massnahmen_unterkategorien.csv'), + os.path.join(directory, 'kr_massnahmen_unterkategorien.csv'), sep=',') # , nrows=numberofcities*1248 print(df_npis_old) numberofcities = int(len(df_npis_old) / 1248) print('Number of cities', numberofcities) -list = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', +codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] #list = ['m01a', 'm01b'] @@ -35,46 +36,61 @@ # create data frame that contains structure of old data df = df_npis_old.iloc[:, :6] -for i in range(numberofcities): - print('City number', i) - counter = 0 - for code in list: - # print(code) - df_npis_new = pd.read_csv( - os.path.join(directory, 'new', - 'kr_massn_unterkat_{}.csv'.format(code)), - sep=',') # , skiprows=1248 - # print(df_npis_new.iloc[0:numberofdays]) - #df_npis_new = df_npis_new.iloc[1152:] - #print('df_new', df_npis_new) - - # initialization of new data frame: - if (i == 0 and counter == 0): - #print('initialization done') - # extract dates from new data - dates = df_npis_new.iloc[:numberofdays, 5] - # rename dates so that they match dates from old npi dataframe - dates_new = [] - for date in dates: - date = ''.join(('d', date.replace('-', ''))) - dates_new.append(date) - # print(date) - - # extend data frame with columns for each date that is included in new data set - df = pd.concat([df, pd.DataFrame(columns=dates_new)]) - - # get number of codes for current subcategory, we substract 6 because we have 6 columns for bundesland, kreis, etc. 
- numberofcodes = 0 - numberofcodes += len(df_npis_new.columns) - 6 - #print('numberofcodes', numberofcodes) - - # insert values from new dataset into data frame - df.iloc[i*1248 + counter: i*1248 + counter + numberofcodes, - 6:] = df_npis_new.iloc[0:numberofdays, 6:].T - - #print(df.iloc[i*1248 + counter: i*1248 + counter + numberofcodes]) - - counter += numberofcodes +start_county = 1 + +df_local = [pd.DataFrame() for i in range(401)] +counter_col = 0 +for code in codelist: + # print(code) + df_npis_new = pd.read_csv( + os.path.join(directory, 'new', + 'kr_massn_unterkat_{}.csv'.format(code)), + sep=',') # , skiprows=1248 + # print(df_npis_new.iloc[0:numberofdays]) + #df_npis_new = df_npis_new.iloc[1152:] + #print('df_new', df_npis_new) + counties = np.sort(df_npis_new.ags5.unique()) + if len(df_npis_new) / len(counties) != numberofdays: + print('error') + if len(counties) != 401: + print('error') + + # extract dates from new data + dates = df_npis_new.iloc[:numberofdays, 5] + # rename dates so that they match dates from old npi dataframe + dates_new = ['d' + date.replace('-', '') for date in dates] + + for i in range(0,401): + if counter_col == 0: + df_local[i] = pd.DataFrame(columns=list(df_npis_new.columns[0:5]) + ['code'] + dates_new) + else: + print('todo') + + df_npis_new[df_npis_new.ags5 == counties[i]].iloc[:, 6:].T.reset_index() + + if counter_col == 0: + df = df_npis_new.copy() + else: + # extend data frame with columns for each date that is included in new data set + df = pd.concat([df, pd.DataFrame(columns=list(df_npis_new.columns[0:5])+ dates_new)]) + + for i in range(start_county, numberofcities): + print('County number', i) + counter_col = 0 + + + + # get number of codes for current subcategory, we substract 6 because we have 6 columns for bundesland, kreis, etc. 
+ numberofcodes = len(df_npis_new.columns) - 6 + #print('numberofcodes', numberofcodes) + + # insert values from new dataset into data frame + df_local[0].iloc[:, 6:] = df_npis_new.iloc[0:numberofdays, 6:].T + + #print(df.iloc[i*1248 + counter: i*1248 + counter + numberofcodes]) + + counter_col += numberofcodes + counter_counties += 1 print(df) df_dropped = df @@ -88,17 +104,17 @@ # df_dropped = df_dropped.drop( # df_dropped.index[range((i+1)*counter, i*counter + 1248)]) df_dropped = df_dropped.drop( - df_dropped.index[range((i+1)*counter, i*counter + 1248)]) + df_dropped.index[range((i+1)*counter_col, i*counter_col + 1248)]) #print('df_dropped in loop', df_dropped.iloc[:(i+1)*counter + 3]) # df_npis_old_dropped = df_npis_old_dropped.drop(df_npis_old_dropped.index[range( # (i+1)*counter, i*counter + 1248)]) df_npis_old_dropped = df_npis_old_dropped.drop(df_npis_old_dropped.index[range( - (i+1)*counter, i*counter + 1248)]) + (i+1)*counter_col, i*counter_col + 1248)]) # extract all dropped rows from df_npis_old # print('indices', (i)*1248, (i+1)*counter) df_lost_subcat = df_lost_subcat.drop( - df_lost_subcat.index[range(i*(1248-counter), i*(1248-counter) + counter)]) + df_lost_subcat.index[range(i*(1248-counter_col), i*(1248-counter_col) + counter_col)]) # print('df_dropped in loop lost subcat', # df_lost_subcat.iloc[:(i)*(1248-counter) + 3]) @@ -145,8 +161,8 @@ nodiffcities = [] for i in range(numberofcities): try: - if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[i*counter: (i+1)*counter, :]), pd.DataFrame(df_dropped.iloc[i*counter: (i+1)*counter, :]), check_dtype=False) == None: - nodiffcities.append(df_npis_old_dropped.iloc[i*counter]['kreis']) + if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[i*counter_col: (i+1)*counter_col, :]), pd.DataFrame(df_dropped.iloc[i*counter_col: (i+1)*counter_col, :]), check_dtype=False) == None: + nodiffcities.append(df_npis_old_dropped.iloc[i*counter_col]['kreis']) #print('No difference in column', df_npis_old_dropped.columns[i]) except AssertionError as e: From 5382c3d0fbc9155872277342599c81acd5ebc308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 21 Nov 2022 16:15:08 +0100 Subject: [PATCH 032/104] compare data and use & sign --- .../memilio/epidata/compareNPIData.py | 31 ++++++------------- .../memilio/epidata/transformNPIData.py | 6 ++-- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py index 792788b5ee..a952c454c0 100644 --- a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py @@ -60,37 +60,26 @@ # rename dates so that they match dates from old npi dataframe dates_new = ['d' + date.replace('-', '') for date in dates] + if (counter_col > 0) and ((old_cols != df_npis_new.columns[0:5]).any()): + print('error') + for i in range(0,401): if counter_col == 0: df_local[i] = pd.DataFrame(columns=list(df_npis_new.columns[0:5]) + ['code'] + dates_new) - else: - print('todo') - - df_npis_new[df_npis_new.ags5 == counties[i]].iloc[:, 6:].T.reset_index() - - if counter_col == 0: - df = df_npis_new.copy() - else: - # extend data frame with columns for each date that is included in new data set - df = pd.concat([df, pd.DataFrame(columns=list(df_npis_new.columns[0:5])+ dates_new)]) - for i in range(start_county, numberofcities): - print('County number', i) - counter_col = 0 + dummy_to_append = 
pd.DataFrame(columns=['code'] + dates_new, data=df_npis_new[df_npis_new.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) + df_local[i] = pd.concat([df_local[i], dummy_to_append]) + old_cols = df_npis_new.columns[0:5] - # get number of codes for current subcategory, we substract 6 because we have 6 columns for bundesland, kreis, etc. - numberofcodes = len(df_npis_new.columns) - 6 - #print('numberofcodes', numberofcodes) + counter_col += 1 - # insert values from new dataset into data frame - df_local[0].iloc[:, 6:] = df_npis_new.iloc[0:numberofdays, 6:].T +# set names for all rows of county +for i in range(0,401): + df_local[i][old_cols] = df_npis_new[df_npis_new.ags5 == counties[i]].iloc[0,0:5] - #print(df.iloc[i*1248 + counter: i*1248 + counter + numberofcodes]) - counter_col += numberofcodes - counter_counties += 1 print(df) df_dropped = df diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 2e55f54fc4..bfb79d77cf 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -740,10 +740,10 @@ def transform_npi_data(fine_resolution=2, countyID, 'Incidence'] = df_infec_local['Incidence'].values # cut infection information at start_date_new and end_date_new - df_infec_local = df_infec_local[df_infec_local + df_infec_local = df_infec_local[(df_infec_local [dd.EngEng['date']] >= - start_date_new][ - df_infec_local[dd.EngEng['date']] <= end_date_new].reset_index() + start_date_new) and + (df_infec_local[dd.EngEng['date']] <= end_date_new)].reset_index() # get county-local data frame start_time = time.perf_counter() From 162f44247ef79a60fbc0c4ec17b534fc2565f187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 21 Nov 2022 17:17:28 +0100 Subject: [PATCH 033/104] sign corrected --- pycode/memilio-epidata/memilio/epidata/transformNPIData.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index bfb79d77cf..ed94bdf316 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -740,10 +740,8 @@ def transform_npi_data(fine_resolution=2, countyID, 'Incidence'] = df_infec_local['Incidence'].values # cut infection information at start_date_new and end_date_new - df_infec_local = df_infec_local[(df_infec_local - [dd.EngEng['date']] >= - start_date_new) and - (df_infec_local[dd.EngEng['date']] <= end_date_new)].reset_index() + df_infec_local = df_infec_local[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( + df_infec_local[dd.EngEng['date']] <= end_date_new)].reset_index() # get county-local data frame start_time = time.perf_counter() From eb3b5a97bd8419028186d2b4b27bdfdfdeb49ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 21 Nov 2022 17:19:06 +0100 Subject: [PATCH 034/104] testing --- pycode/memilio-epidata/memilio/epidata/compareNPIData.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py index a952c454c0..fc6f1d04a6 100644 --- a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py @@ -77,8 +77,9 @@ # set names for all rows of 
county for i in range(0,401): + print('county ' + str(i)) df_local[i][old_cols] = df_npis_new[df_npis_new.ags5 == counties[i]].iloc[0,0:5] - + pd.testing.assert_frame_equal(df_npis_old[df_npis_old.ags5==counties[i]].iloc[:1152,6:].reset_index(drop=True), df_local[i].iloc[:,6:723].reset_index(drop=True), check_dtype=False) print(df) From 4a46b1f3bfb9ab7fc55b962bc9cbf7d1a3545065 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Thu, 24 Nov 2022 13:42:27 +0100 Subject: [PATCH 035/104] performance improvement --- .../memilio/epidata/transformNPIData.py | 90 ++++++++----------- 1 file changed, 38 insertions(+), 52 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index ed94bdf316..ba545ce6aa 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -299,7 +299,7 @@ def transform_npi_data(fine_resolution=2, # defines delay in number of days between exceeding # incidence threshold and NPI getting active # delay = 0 means only one day is considered (=no delay) - npi_activation_delay = 0 + npi_activation_delay = 2 npi_lifting_delay = 4 # for NRW, BW # 2 for bayern # we use npi_lifting_delay = 4 as this is the most common @@ -353,7 +353,7 @@ def transform_npi_data(fine_resolution=2, df_npis_desc = pd.read_excel( os.path.join( directory, 'datensatzbeschreibung_massnahmen.xlsx'), - sheet_name=3) + sheet_name=3, engine = 'openpyxl') except FileNotFoundError: print_manual_download( 'datensatzbeschreibung_massnahmen.xlsx', @@ -426,6 +426,8 @@ def transform_npi_data(fine_resolution=2, df_npis_combinations[npi_groups_combinations_unique[i]][1].insert( 0, 'Code', codes_local) + del df_npis_combinations_pre + # use to_excel function and specify the sheet_name and index # to store the dataframe in specified sheet if file not yet existent # otherwise just valid results against stored sheets @@ -454,12 +456,14 @@ def transform_npi_data(fine_resolution=2, sheet_name=i, engine = 'openpyxl') if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out): print('Error in combination matrix.') + del df_in_valid except: pass if write_file: df_out.to_excel( writer, sheet_name=npi_groups_combinations_unique[i]) + del df_out if write_file: writer.save() @@ -473,6 +477,7 @@ def transform_npi_data(fine_resolution=2, i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] # correct M05_N codes to M_05_M_N, N in {1,...,5}, M in {130,150,120,140,110,100,160} + # TODO: check for i in range(1, 6): npi_codes_prior[npi_codes_prior == 'M05_'+str(i)] = ['M05_130_'+str(i), 'M05_150_'+str( i), 'M05_120_'+str(i), 'M05_140_'+str(i), 'M05_110_'+str(i), 'M05_100_'+str(i), 'M05_160_'+str(i)] @@ -491,7 +496,9 @@ def transform_npi_data(fine_resolution=2, if fine_resolution == 1: missing_grouped_codes = [] for mcode in missing_codes: - if len(mcode.split('_')) < 2: + # only consider incidence independent npis + # only exit if one of these (i.e., MCODE_NUMBER) is missing + if len(mcode.split('_')) != 3: missing_grouped_codes.append(mcode) if len(missing_grouped_codes) > 0: # only MCODE_NUMBER codes sys.exit('Missing NPI codes: ' + @@ -551,13 +558,14 @@ def transform_npi_data(fine_resolution=2, # extract variable names for main categories npi_desc = list(df_npis_desc["Variable"][npi_codes_sorting]) + del df_npis_desc + # combine NPI codes and descriptions to ensure that both are ordered # the same way; delete npi_codes or npi_desc for not using 
hereafter idx_codes_retained = ~pd.Series(npi_codes).isin(codes_dropped) - npis_dummy = { + npis = pd.DataFrame({ dd.EngEng['npiCode']: list(pd.Series(npi_codes)[idx_codes_retained]), - dd.EngEng['desc']: list(pd.Series(npi_desc)[idx_codes_retained])} - npis = pd.DataFrame(npis_dummy) + dd.EngEng['desc']: list(pd.Series(npi_desc)[idx_codes_retained])}) del npi_codes del npi_desc # remove rows and columns of unused codes @@ -597,23 +605,23 @@ def transform_npi_data(fine_resolution=2, npi_codes_aggregated)].reset_index() else: npis_final = npis - + del npis # extract incidence-threshold for NPIs if fine_resolution > 0: npi_incid_start = dict() - for i in range(len(npis)): + for i in range(len(npis_final)): incid_threshold = 1e5 - if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': + if npis_final.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': # set -1 for incidence-independent NPIs incid_threshold = -1 - elif npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': + elif npis_final.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': incid_threshold = int( - npis.loc[i, dd.EngEng['desc']].split(' ')[1]) + npis_final.loc[i, dd.EngEng['desc']].split(' ')[1]) else: sys.exit( 'Error in description file. NPI activation can not ' 'be computed. Exiting.') - npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] + npi_incid_start[npis_final.loc[i, dd.EngEng['npiCode']] ] = incid_threshold # get all incidence thresholds (This list has to be sorted) @@ -633,8 +641,8 @@ def transform_npi_data(fine_resolution=2, # create hash map from thresholds to NPI indices incidence_thresholds_to_npis = dict( zip(incidence_thresholds, [[] for i in range(len(incidence_thresholds))])) - for i in range(len(npis)): - code_considered = npis.loc[i, dd.EngEng['npiCode']] + for i in range(len(npis_final)): + code_considered = npis_final.loc[i, dd.EngEng['npiCode']] incval = npi_incid_start[code_considered] if len(code_considered.split('_')) < 3: incidence_thresholds_to_npis[(incval, '')].append(i) @@ -717,10 +725,13 @@ def transform_npi_data(fine_resolution=2, ['int' for i in npis_final[dd.EngEng['npiCode']]]))) # iterate over countyIDs - counters = np.zeros(5) # time counter for output only + counters = np.zeros(4) # time counter for output only countyidx = 0 - # unique_geo_entities: - for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + # replace -99 ("not used anymore") by 0 ("not used") + # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") + df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True) + + for countyID in unique_geo_entities: cid = 0 countyidx += 1 @@ -731,9 +742,8 @@ def transform_npi_data(fine_resolution=2, pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] == countyID, dd.EngEng['population']].values[0] # consider difference between current day and day-7 to compute incidence - incidence_local = df_infec_local[dd.EngEng['confirmed']].diff( + df_infec_local['Incidence'] = df_infec_local[dd.EngEng['confirmed']].diff( periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) / pop_local * 100000 - df_infec_local['Incidence'] = incidence_local # set to main data frame df_infec_rki.loc[df_infec_rki[dd.EngEng['idCounty']] == @@ -749,50 +759,26 @@ def transform_npi_data(fine_resolution=2, == countyID].copy() # potentially remove rows if they are not in npis dict - npi_rows = [i in npis[dd.EngEng['npiCode']].values + npi_rows = [i in npis_final[dd.EngEng['npiCode']].values for i in df_local_old[dd.EngEng['npiCode']]] - # get list of NPI codes, ordered as 
the rows in the current data frame - npi_codes_ordered_as_rows = npis_final['NPI_code'].to_list() - # get indices of rows for the NPI codes as in the sorted npi_codes list - # may be superfluous if NPI code rows are sorted correctly - npi_code_rows_to_sorted = [ - npi_codes_ordered_as_rows.index(i) for i in - npis[dd.EngEng['npiCode']].values] - - # access NPI values matrix and store it as integers - npi_vals = df_local_old.iloc[npi_rows, start_npi_cols:].astype(int) - - # create columns for date, county ID and NPI code + # create columns for date, county ID df_local_new = pd.DataFrame( - columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']] + - list(npis[dd.EngEng['npiCode']])) + columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']]) counters[cid] += time.perf_counter()-start_time cid += 1 start_time = time.perf_counter() + npis_new = df_local_old.iloc[npi_rows, start_npi_cols-1:].set_index(dd.EngEng['npiCode']).transpose().reset_index(drop=True).copy() # fill in NPI values by transposing from columns to rows df_local_new[dd.EngEng['date']] = dates_new df_local_new[dd.EngEng['idCounty']] = countyID # possible resorting of rows such that they are sorted according to # a literal sorting of the code strings - df_local_new[npis[dd.EngEng['npiCode']]] = np.transpose( - npi_vals.iloc[npi_code_rows_to_sorted, :].values) - - counters[cid] += time.perf_counter()-start_time - cid += 1 - - start_time = time.perf_counter() - - # replace -99 ("not used anymore") by 0 ("not used") - df_local_new[npis[dd.EngEng['npiCode']] - ] = df_local_new[npis[dd.EngEng['npiCode']]].replace(-99, 0) - # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") - df_local_new[npis[dd.EngEng['npiCode']] - ] = df_local_new[npis[dd.EngEng['npiCode']]].replace([2, 3, 4, 5], 1) + df_local_new = pd.concat([df_local_new.copy(), npis_new], axis = 1) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -815,7 +801,7 @@ def transform_npi_data(fine_resolution=2, # get index of first NPI column in local data frame npis_idx_start = list( df_local_new.columns).index( - npis[dd.EngEng['npiCode']][0]) + npis_final[dd.EngEng['npiCode']][0]) # iterate through all NPIs and activate if incidence threshold # is exceeded @@ -940,6 +926,7 @@ def transform_npi_data(fine_resolution=2, for main_code, codes_group in maincode_to_npicodes_map.items(): # group by incidence (former codes X1_Y, X1_Z were transformed # to X1, X2) and write max value to main code column + # TODO: check df_local_new.loc[:, main_code] = df_local_new.loc[:, codes_group].max( axis=1) # remove subcategory columns @@ -952,8 +939,7 @@ def transform_npi_data(fine_resolution=2, start_time = time.perf_counter() - df_npis = pd.concat([df_npis, df_local_new], ignore_index=True) - + df_npis = pd.concat([df_npis.copy(), df_local_new.copy()], ignore_index=True) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1038,7 +1024,7 @@ def transform_npi_data(fine_resolution=2, # [:, start_npi_cols - 1:])[0]) > 0: # print('Error in file writing/reading') npi_codes_considered = [] #which codes? 
- analyze_npi_data(True, True, fine_resolution, npis, directory, file_format, npi_codes_considered) + analyze_npi_data(True, True, fine_resolution, npis_final, directory, file_format, npi_codes_considered) def analyze_npi_data(read_data, make_plot, fine_resolution, npis, directory, file_format, npi_codes_considered): From 3d3e4e098161795031781d8a86dde00586ba21cc Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 9 Jan 2023 09:42:08 +0100 Subject: [PATCH 036/104] fix error --- .../memilio/epidata/transformNPIData.py | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index ba545ce6aa..8b5355b04f 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -477,7 +477,6 @@ def transform_npi_data(fine_resolution=2, i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] # correct M05_N codes to M_05_M_N, N in {1,...,5}, M in {130,150,120,140,110,100,160} - # TODO: check for i in range(1, 6): npi_codes_prior[npi_codes_prior == 'M05_'+str(i)] = ['M05_130_'+str(i), 'M05_150_'+str( i), 'M05_120_'+str(i), 'M05_140_'+str(i), 'M05_110_'+str(i), 'M05_100_'+str(i), 'M05_160_'+str(i)] @@ -569,7 +568,7 @@ def transform_npi_data(fine_resolution=2, del npi_codes del npi_desc # remove rows and columns of unused codes - for code in df_npis_combinations.keys(): + for code in df_npis_combinations.keys(): # does not work for fine_resolution!=2 local_codes_used_rows = df_npis_combinations[code][1].Code.isin( npis.NPI_code) local_codes_used_cols = df_npis_combinations[code][1].columns.isin( @@ -605,23 +604,23 @@ def transform_npi_data(fine_resolution=2, npi_codes_aggregated)].reset_index() else: npis_final = npis - del npis + # extract incidence-threshold for NPIs if fine_resolution > 0: npi_incid_start = dict() - for i in range(len(npis_final)): + for i in range(len(npis)): incid_threshold = 1e5 - if npis_final.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': + if npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Unabhängig': # set -1 for incidence-independent NPIs incid_threshold = -1 - elif npis_final.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': + elif npis.loc[i, dd.EngEng['desc']].split(' ')[0] == 'Ab': incid_threshold = int( - npis_final.loc[i, dd.EngEng['desc']].split(' ')[1]) + npis.loc[i, dd.EngEng['desc']].split(' ')[1]) else: sys.exit( 'Error in description file. NPI activation can not ' 'be computed. 
Exiting.') - npi_incid_start[npis_final.loc[i, dd.EngEng['npiCode']] + npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] ] = incid_threshold # get all incidence thresholds (This list has to be sorted) @@ -641,8 +640,8 @@ def transform_npi_data(fine_resolution=2, # create hash map from thresholds to NPI indices incidence_thresholds_to_npis = dict( zip(incidence_thresholds, [[] for i in range(len(incidence_thresholds))])) - for i in range(len(npis_final)): - code_considered = npis_final.loc[i, dd.EngEng['npiCode']] + for i in range(len(npis)): + code_considered = npis.loc[i, dd.EngEng['npiCode']] incval = npi_incid_start[code_considered] if len(code_considered.split('_')) < 3: incidence_thresholds_to_npis[(incval, '')].append(i) @@ -759,7 +758,7 @@ def transform_npi_data(fine_resolution=2, == countyID].copy() # potentially remove rows if they are not in npis dict - npi_rows = [i in npis_final[dd.EngEng['npiCode']].values + npi_rows = [i in npis[dd.EngEng['npiCode']].values for i in df_local_old[dd.EngEng['npiCode']]] @@ -801,7 +800,7 @@ def transform_npi_data(fine_resolution=2, # get index of first NPI column in local data frame npis_idx_start = list( df_local_new.columns).index( - npis_final[dd.EngEng['npiCode']][0]) + npis[dd.EngEng['npiCode']][0]) # iterate through all NPIs and activate if incidence threshold # is exceeded @@ -926,7 +925,6 @@ def transform_npi_data(fine_resolution=2, for main_code, codes_group in maincode_to_npicodes_map.items(): # group by incidence (former codes X1_Y, X1_Z were transformed # to X1, X2) and write max value to main code column - # TODO: check df_local_new.loc[:, main_code] = df_local_new.loc[:, codes_group].max( axis=1) # remove subcategory columns @@ -975,7 +973,7 @@ def transform_npi_data(fine_resolution=2, start_date_validation = datetime(2020, 3, 1) end_date_validation = datetime(2022, 2, 15) - for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + for countyID in unique_geo_entities: for npiCode in [ 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: @@ -994,7 +992,7 @@ def transform_npi_data(fine_resolution=2, start_date_validation = datetime(2020, 3, 1) end_date_validation = datetime(2022, 2, 15) - for countyID in [5315, 1001, 9162, 16071, 11000, 1060, 5566]: + for countyID in unique_geo_entities: for npiCode in [ 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: From 309b5c171fda377cd42441e409a7d0d55b348a9d Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 9 Jan 2023 14:16:11 +0100 Subject: [PATCH 037/104] create df_local_new directly from df_local_old --- .../memilio/epidata/transformNPIData.py | 67 +++++++++++-------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py index 8b5355b04f..e4d0a3a874 100644 --- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py @@ -256,7 +256,8 @@ def transform_npi_data(fine_resolution=2, file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], start_date=dd.defaultDict['start_date'], - end_date=dd.defaultDict['end_date'] + end_date=dd.defaultDict['end_date'], + counties_considered=geoger.get_county_ids() ): """! 
Loads a certain resolution of recorded NPI data from the Corona Datenplattform and transforms it according to the @@ -364,13 +365,13 @@ def transform_npi_data(fine_resolution=2, npi_codes_prior = df_npis_desc['Variablenname'] npi_codes_prior_desc = df_npis_desc['Variable'] - # for fine_resolution == 2 deactivation of non-combinable + # for fine_resolution > 0 deactivation of non-combinable # incidence-dependent NPIs has to be conducted; therefore we defined a # matrix of possible combinations of NPIs (marked with an X if combinable) # NPIs of different main category (e.g., M01a and M04) can always be # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each # other - if fine_resolution == 2: + if fine_resolution > 0: df_npis_combinations_pre = pd.read_excel( os.path.join( directory, 'combination_npis.xlsx'), engine = 'openpyxl') @@ -568,15 +569,16 @@ def transform_npi_data(fine_resolution=2, del npi_codes del npi_desc # remove rows and columns of unused codes - for code in df_npis_combinations.keys(): # does not work for fine_resolution!=2 - local_codes_used_rows = df_npis_combinations[code][1].Code.isin( - npis.NPI_code) - local_codes_used_cols = df_npis_combinations[code][1].columns.isin( - npis.NPI_code) + if fine_resolution > 0: + for code in df_npis_combinations.keys(): + local_codes_used_rows = df_npis_combinations[code][1].Code.isin( + npis.NPI_code) + local_codes_used_cols = df_npis_combinations[code][1].columns.isin( + npis.NPI_code) - # overwrite item 0 since codes are stored in *.columns - df_npis_combinations[code] = df_npis_combinations[code][1].loc[local_codes_used_rows, - local_codes_used_cols].reset_index(drop=True).copy() + # overwrite item 0 since codes are stored in *.columns + df_npis_combinations[code] = df_npis_combinations[code][1].loc[local_codes_used_rows, + local_codes_used_cols].reset_index(drop=True).copy() # prepare grouping of NPIs to reduce product space of # NPI x active_from_inc (with values "incidence does not matter", and @@ -649,18 +651,16 @@ def transform_npi_data(fine_resolution=2, incidence_thresholds_to_npis[( incval, '_' + code_considered.split('_')[2])].append(i) - # get county ids - unique_geo_entities = geoger.get_county_ids() # check if more than the county of Eisenach would be removed with # current county list counties_removed = df_npis_old[ - ~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][ + ~df_npis_old[dd.EngEng['idCounty']].isin(counties_considered)][ dd.EngEng['idCounty']].unique() if list(counties_removed) != [16056]: sys.exit('Error. 
Other counties than that of Eisenach were removed.') # remove rows for Eisenach df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( - unique_geo_entities)].reset_index(drop=True) + counties_considered)].reset_index(drop=True) start_npi_cols = list( df_npis_old.columns).index( @@ -730,7 +730,7 @@ def transform_npi_data(fine_resolution=2, # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True) - for countyID in unique_geo_entities: + for countyID in counties_considered: cid = 0 countyidx += 1 @@ -770,14 +770,25 @@ def transform_npi_data(fine_resolution=2, cid += 1 start_time = time.perf_counter() - npis_new = df_local_old.iloc[npi_rows, start_npi_cols-1:].set_index(dd.EngEng['npiCode']).transpose().reset_index(drop=True).copy() - # fill in NPI values by transposing from columns to rows - df_local_new[dd.EngEng['date']] = dates_new + # old dataframe has npi codes as columns and date values as rows + # new dataframe should be transposed + df_local_new = df_local_old.iloc[npi_rows, start_npi_cols-1:].set_index( + dd.EngEng['npiCode']).transpose().copy() + # get datetime as a column (previously index after transposing) + df_local_new = df_local_new.reset_index( + drop=False).rename( + columns={'index': dd.EngEng['date']}) + # reset index name (which is dd.EngEng['npiCode'] after transposing) + df_local_new.rename_axis('', axis=1, inplace=True) + # change time format from 'dYYYYMMDD' to datetime timestamps + df_local_new[dd.EngEng['date']] = pd.to_datetime( + df_local_new[dd.EngEng['date']], format='d%Y%m%d') + # fill in column for county ID df_local_new[dd.EngEng['idCounty']] = countyID - # possible resorting of rows such that they are sorted according to - # a literal sorting of the code strings - df_local_new = pd.concat([df_local_new.copy(), npis_new], axis = 1) + # sort columns as to {Date, ID_County, npi_codes...} + # for now this can be done alphabetically + df_local_new.sort_index(axis=1, inplace=True) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -944,12 +955,12 @@ def transform_npi_data(fine_resolution=2, # divide working time by completed number of counties and multiply # by remaining number of counties to estimate time remaining time_remain = sum( - counters) / countyidx * (len(unique_geo_entities) - countyidx) + counters) / countyidx * (len(counties_considered) - countyidx) # print progress if countyidx == 1 or countyidx % int( - len(unique_geo_entities) / 10) == 0: + len(counties_considered) / 10) == 0: print('Progress ' + str(countyidx) + ' / ' + - str(len(unique_geo_entities)) + + str(len(counties_considered)) + '. 
Estimated time remaining: ' +
                      str(int(time_remain / 60)) + ' min.')
@@ -973,7 +984,7 @@ def transform_npi_data(fine_resolution=2,
         start_date_validation = datetime(2020, 3, 1)
         end_date_validation = datetime(2022, 2, 15)

-        for countyID in unique_geo_entities:
+        for countyID in counties_considered:
             for npiCode in [
                     'M01a_010', 'M01a_150', 'M05_120', 'M01a_010',
                     'M18_030', 'M01b_020', 'M02b_035', 'M16_050']:
@@ -992,7 +1003,7 @@ def transform_npi_data(fine_resolution=2,
         start_date_validation = datetime(2020, 3, 1)
         end_date_validation = datetime(2022, 2, 15)

-        for countyID in unique_geo_entities:
+        for countyID in counties_considered:
             for npiCode in [
                     'M01a_010', 'M01a_150', 'M05_120', 'M01a_010',
                     'M18_030', 'M01b_020', 'M02b_035', 'M16_050']:
@@ -1318,4 +1329,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From b47e3b22e1ca5069fcc180de054df8bcf6bfb34e Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Mon, 9 Jan 2023 14:22:38 +0100
Subject: [PATCH 038/104] remove analysis from transformNPIData file (moved to
 Issue #444)

---
 .../memilio/epidata/transformNPIData.py | 438 ------------------
 1 file changed, 438 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
index e4d0a3a874..1bf9b65ea8 100644
--- a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/transformNPIData.py
@@ -33,150 +33,6 @@
-def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all):
-    """! Computes a score for a particular clustering based on the
-    correlation matrix. The score is computed as the percentage of 'higher'
-    or 'high' values (e.g., between 0.5 and 0.75 or 0.75 and 1) of the
-    correlation matrix that are to be found in the diagonal blocks of the
-    clustered correlation matrix vs these values in the offdiagonal blocks.
-
-    @param corr_mat correlation matrix between the features / data set items
-    that were clustered.
-    @param idx_to_cluster_idx Mapping of data item to cluster index.
-    @param indices_all List of indices of all data items.
-
-    @return Scores for the provided clustering.
-    """
-
-    if idx_to_cluster_idx.min() == 1:
-        idx_to_cluster_idx -= 1
-
-    # store indices of clusters
-    clusters = [[] for i in range(idx_to_cluster_idx.max()+1)]
-    for ii in range(len(idx_to_cluster_idx)):
-        clusters[idx_to_cluster_idx[ii]].append(ii)
-    # store remaining/perpendicular indices for all clusters
-    clusters_perp = [[] for i in range(idx_to_cluster_idx.max()+1)]
-    for ii in range(len(clusters)):
-        clusters_perp[ii] = list(indices_all.difference(set(clusters[ii])))
-    # extract correlation values of block diagonals and offdiagonals separ.
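
As an aside, the np.ix_ block indexing that this (now removed) scoring code relies on can be illustrated with a toy matrix (cluster indices are illustrative):

    import numpy as np

    corr_mat = np.arange(16).reshape(4, 4)
    cluster, cluster_perp = [0, 2], [1, 3]
    # correlation block within the cluster (rows and columns 0 and 2)
    print(corr_mat[np.ix_(cluster, cluster)])       # selects [[0, 2], [8, 10]]
    # correlations between the cluster and the remaining indices
    print(corr_mat[np.ix_(cluster, cluster_perp)])  # selects [[1, 3], [9, 11]]
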
- corr_diag = [] - corr_offdiag = [] - for ii in range(len(clusters)): - corr_diag = np.append(corr_diag, abs( - corr_mat[np.ix_(clusters[ii], clusters[ii])].flatten())) - corr_offdiag = np.append(corr_offdiag, abs( - corr_mat[np.ix_(clusters[ii], clusters_perp[ii])].flatten())) - - corr_thresholds = [0.25, 0.5, 0.75] - cluster_quantification = np.zeros(6) - for ii in range(len(corr_thresholds)): - num_diag = len(np.where(corr_diag > corr_thresholds[ii])[0]) - num_offdiag = len(np.where(corr_offdiag > corr_thresholds[ii])[0]) - if ii < len(corr_thresholds)-1: - num_diag -= len(np.where(corr_diag > corr_thresholds[ii+1])[0]) - num_offdiag -= len(np.where(corr_offdiag > - corr_thresholds[ii+1])[0]) - cluster_quantification[2*ii] = num_diag / (num_diag+num_offdiag) - cluster_quantification[2*ii+1] = ( - num_diag+num_offdiag) / (len(indices_all)**2) - - # print scores on clustering - print("Number of clusters: " + str(len(clusters)) + - ", shares diag/all between [0.25, 0.5, 0.75]: %.4f" % - cluster_quantification[0] + " (%.4f" % cluster_quantification[1] + - "), " + " %.4f " % cluster_quantification[2] + " (%.4f" % - cluster_quantification[3] + "), " + " %.4f " % - cluster_quantification[4] + " (%.4f" % cluster_quantification[5] + ")") - - return cluster_quantification - - -def compute_hierarch_clustering(corr_mat, corr_pairwdist, - metrics=['single', 'complete', 'average', - 'weighted', 'centroid', 'median', - 'ward']): - """! Computes a hierarchical clustering for a (list of) metric(s) and - provides the maximum cophenetic distance(s) as well as a score for the - clustering (see @method evaluate_clustering(...)). - - @param corr_mat correlation matrix between the features / data set items - to be clustered hierarchically. - @param corr_pairwdist Computed pairwise distance between the features / data - set items. - @param metric Metric or list of metrics to compute the hierarchical - clustering. - - @return (List of) hierarchical clustering(s), maximum cophenetic distance(s) - and scores of the hierarchical clustering. - """ - # NOTE: if changing metric, pay attention to linkage methods; - # 'centroid', 'median', and 'ward' are correctly defined only if - # Euclidean pairwise metric is used. - # Based on the distances, we compute an hierarchical clustering for - # different metrics - max_coph_corr = 0 - scores = dict() - # allow single entry - if not isinstance(metrics, list): - metrics = [metrics] - # iterate over list - for metric in metrics: - cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=metric) - # compute cophentic correlation distance - coph_corr, coph_dists = hierarchy.cophenet( - cluster_hierarch, pdist(corr_mat)) - scores[metric] = coph_corr - if coph_corr > max_coph_corr: - max_coph_corr = coph_corr - max_metric = metric - max_coph_dist = coph_dists - - cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_metric) - - print( - "Cophentic correlation distance for metric " + max_metric + ": " + - str(max_coph_corr)) - - return cluster_hierarch, max_coph_dist, scores - - -def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): - """! Flattens a hierarchical clustering for a (list of) maximum cophenetic - distance(s) in the flat clusters and evaluates the resulting clustering with - respect to the corresponding correlation matrix. - - @param corr_mat correlation matrix between the features / data set items - clustered hierarchically. - @param cluster_hierarch hierarchical clustering of given features / data - set items. 
- @param weigths Maximum cophenetic distance or list of maximum cophenetic - distances to compute the flat clustering(s). - - @return flat clustering(s) according to the (list of) maximum distance(s). - """ - - # all indices in npis_corr from 0 to n-1 - npi_indices_all = set(range(corr_mat.shape[0])) - npi_idx_to_cluster_idx_list = [] - # allow single entries - if not isinstance(weights, list): - weights = [weights] - # iterate over weights - for weight in weights: - # use the given weight to flatten the dendrogram - npi_idx_to_cluster_idx = hierarchy.fcluster( - cluster_hierarch, weight, criterion='distance') - - # evaluate clustering - evaluate_clustering(corr_mat, npi_idx_to_cluster_idx, npi_indices_all) - - # append new npi_idx to cluster_idx assignment to list of assignments - npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) - - return npi_idx_to_cluster_idx_list - - def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, start_npi_cols, npi_incid_start, start_date_validation, end_date_validation, fine_resolution): @@ -1025,300 +881,6 @@ def transform_npi_data(fine_resolution=2, filename = 'germany_counties_npi_maincat' gd.write_dataframe(df_npis, directory, filename, file_format) - # stupid validation - # df_validation = pd.read_json(directory + filename + ".json") - # if len( - # np.where( - # df_validation.iloc[:, start_npi_cols - 1:] != df_npis.iloc - # [:, start_npi_cols - 1:])[0]) > 0: - # print('Error in file writing/reading') - npi_codes_considered = [] #which codes? - analyze_npi_data(True, True, fine_resolution, npis_final, directory, file_format, npi_codes_considered) - - -def analyze_npi_data(read_data, make_plot, fine_resolution, npis, directory, file_format, npi_codes_considered): - - if not read_data: - x = 15 - # transform_npi_data(fine_resolution=2, - # file_format=dd.defaultDict['file_format'], - # out_folder=dd.defaultDict['out_folder'], - # start_date=dd.defaultDict['start_date'], - # end_date=dd.defaultDict['end_date'], - # make_plot=dd.defaultDict['make_plot'], - # ) - - else: # read formatted file - - if fine_resolution > 0: - if fine_resolution == 1: - filename = 'germany_counties_npi_subcat_incgrouped' - else: - filename = 'germany_counties_npi_subcat' - else: - filename = 'germany_counties_npi_maincat' - df_npis = pd.read_json(directory + filename + ".json") - # get code levels (main/subcodes) and position of main codes - # code_level = [i.count('_') for i in npi_codes] - # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] - - # check if any other integer than 0: not implemented or 1: implemented is - # used (maybe to specify the kind of implementation) - if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0: - - print("Info: Please ensure that NPI information is only boolean.") - - else: - # sum over different NPIs and plot share of countires implementing - # these NPIs versus counties without corresponding actions - df_npis_aggregated = df_npis.groupby( - dd.EngEng['date']).agg( - {i: sum for i in npi_codes_considered}).copy() - npis_total_sum = df_npis_aggregated.sum() - - npi_codes_empty = list(np.array(npi_codes_considered)[ - np.where(npis_total_sum == 0)[0]]) - - npi_unused_indices_all = [] - npi_used_indices_all = [] - npi_unused_indices = [] - npi_used_indices = [] - for i in range(len(npi_codes_considered)): - if npi_codes_considered[i] in npi_codes_empty: - npi_unused_indices.append(i) - npi_unused_indices_all.append( - npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) - else: - 
npi_used_indices.append(i) - npi_used_indices_all.append( - npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) - - npis_unused = np.array(npis[dd.EngEng['desc']])[npi_unused_indices_all] - npis_used = np.array(npis[dd.EngEng['desc']])[npi_used_indices_all] - npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices]) - npi_codes_unused = list( - np.array(npi_codes_considered)[npi_unused_indices]) - - # open file to write unused categories - if fine_resolution > 0: - if fine_resolution == 1: - filename = 'unused_subcats_incgrouped.txt' - else: - filename = 'unused_subcats.txt' - else: - filename = 'unused_maincats.txt' - file_npi = open(directory + filename, 'w') - # Writing unused NPIs - for i in range(len(npis_unused)): - file_npi.write(npi_codes_unused[i] + ": " + npis_unused[i]) - file_npi.write("\n") - # Closing file - file_npi.close() - - # open file to write unused categories - if fine_resolution > 0: - if fine_resolution == 1: - filename = 'used_subcats_incgrouped.txt' - else: - filename = 'used_subcats.txt' - else: - filename = 'used_maincats.txt' - file_npi = open(directory + filename, 'w') - # Writing unused NPIs - for i in range(len(npis_used)): - file_npi.write(npi_codes_used[i] + ": " + npis_used[i]) - file_npi.write("\n") - # Closing file - file_npi.close() - - df_npis_used = df_npis[[dd.EngEng['date'], - dd.EngEng['idCounty']] + npi_codes_used].copy() - if fine_resolution > 0: - if fine_resolution == 1: - filename = 'germany_counties_npi_subcat_used_incgrouped' - else: - filename = 'germany_counties_npi_subcat_used' - else: - filename = 'germany_counties_npi_maincat_used' - gd.write_dataframe(df_npis_used, directory, filename, file_format) - - # compute correlations - npis_corr = df_npis_used.iloc[:, 2:].corr().values - # plot log-colored correlations - plt.imshow(abs(npis_corr), cmap='gray_r') - # plot histogram - plt.figure() - plt.hist(npis_corr.flatten(), bins=50) - plt.title("Correlation histogram", fontsize=18) - plt.xlabel("Correlation", fontsize=12) - plt.ylabel("Number of values", fontsize=12) - - # We understand the rows of npis_corr, the correlations of one NPI - # to the others as one node in the #NPIs-used-dimensional space. - # We compute the pairwise distances of these nodes. 
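(A toy illustration of the distance idea sketched in these comments, noted here although the analysis block itself moves out with this patch to Issue #444; the 3x3 correlation matrix is made up. Rows that correlate with all other rows in a similar way come out with a small mutual distance:)

    import numpy as np
    from scipy.spatial.distance import pdist

    # Made-up correlation matrix: rows 0 and 1 relate to the three columns
    # in a similar way, row 2 differs clearly.
    npis_corr = np.array([[1.0, 0.9, 0.1],
                          [0.9, 1.0, 0.2],
                          [0.1, 0.2, 1.0]])
    # Condensed output order: d(0,1), d(0,2), d(1,2); d(0,1) is the smallest.
    print(pdist(npis_corr, metric='euclidean'))
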
Then, nodes with - # similar correlations towards all other nodes exhibit small distances - corr_pairwdist = hierarchy.distance.pdist( - npis_corr, metric='euclidean') - - # compute hierarchical clustering (via best-suited metric) - compare_metrics = True - if compare_metrics: - # centroid - metric = 'centroid' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( - abs(npis_corr), - corr_pairwdist, - metric) - # # plot dendrogram - plt.figure() - plt.title(metric) - hierarchy.dendrogram(cluster_hierarch) - plt.show() - max_coph_dist = coph_dist.max() - flatten_hierarch_clustering( - abs(npis_corr), cluster_hierarch, - [wg * max_coph_dist - for wg in [0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75]]) - # ward - metric = 'ward' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( - npis_corr, - corr_pairwdist, - metric) - # # plot dendrogram - # plt.figure() - # plt.title(metric) - # hierarchy.dendrogram(cluster_hierarch) - # plt.show() - max_coph_dist = coph_dist.max() - flatten_hierarch_clustering( - abs(npis_corr), cluster_hierarch, - [wg * max_coph_dist for wg in [0.1, 0.125, 0.15, 0.175, 0.2]]) - # average - metric = 'average' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( - abs(npis_corr), - corr_pairwdist, - metric) - # # plot dendrogram - # plt.figure() - # plt.title(metric) - # hierarchy.dendrogram(cluster_hierarch) - # plt.show() - max_coph_dist = coph_dist.max() - flatten_hierarch_clustering( - npis_corr, cluster_hierarch, - [wg * max_coph_dist - for wg in [0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65]]) - - metric = 'centroid' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( - npis_corr, - corr_pairwdist, - metric) - # # plot dendrogram - # plt.figure() - # plt.title(metric) - # hierarchy.dendrogram(cluster_hierarch) - # plt.show() - max_coph_dist = coph_dist.max() - npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr, cluster_hierarch, - [wg * max_coph_dist - for wg in [0.65]]) - - cluster_dict = dict() - cluster_codes = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] - cluster_desc = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] - for i in range(len(npi_idx_to_cluster_idx[0])): - cluster_dict[npi_codes_used[i] - ] = "CM_" + str(npi_idx_to_cluster_idx[0][i]).zfill(3) - cluster_codes[npi_idx_to_cluster_idx[0] - [i]].append(npi_codes_used[i]) - cluster_desc[npi_idx_to_cluster_idx[0] - [i]].append(str(npis_used[i])) - - # create clustered dataframe - df_npis_clustered = df_npis[[ - dd.EngEng['date'], dd.EngEng['idCounty']]].copy() - - for i in range(len(cluster_codes)): - df_npis_clustered["CM_" + str(i).zfill(3) - ] = df_npis[cluster_codes[i]].max(axis=1).copy() - - npis_corr_cluster = df_npis_clustered.corr() - # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 - plt.imshow(abs(npis_corr_cluster), cmap='gray_r') - plt.title('Absolute correlation>0.25 of clustered NPIs') - plt.xlabel('NPI cluster') - plt.ylabel('NPI cluster') - plt.colorbar() - - # open file to write unused categories - if fine_resolution > 0: - if fine_resolution == 1: - filename = 'clusters_subcats_incgrouped.txt' - else: - filename = 'clusters_subcats.txt' - else: - filename = 'clusters_maincats.txt' - file_npi = open(directory + filename, 'w') - # Writing unused NPIs - for i in range(len(cluster_codes)): - file_npi.write("Cluster " + str(i) + "\n") - for j in range(len(cluster_codes[i])): - file_npi.write(cluster_codes[i][j] + ": " + cluster_desc[i][j]) - file_npi.write("\n") - file_npi.write("\n") - # 
Closing file - file_npi.close() - - npi_idx_new = np.argsort(npi_idx_to_cluster_idx[0]) - npis_corr_reorder = npis_corr[npi_idx_new, :][:, npi_idx_new] - - plt.imshow(abs(npis_corr_reorder), cmap='gray_r') - plt.colorbar() - - # npi_indices_all = set(range(npis_corr.shape[0])) - # for i in [40]:#[10, 20, 40, 80, 160]: - # kmeans_npis = KMeans(n_clusters=i).fit(df_npis_used.iloc[:,2:].T) - # evaluate_clustering(npis_corr, kmeans_npis.labels_, npi_indices_all) - - # for i in [40]:#[10, 20, 40, 80, 160]: - # kmeans_corr = KMeans(n_clusters=i).fit(npis_corr) - # evaluate_clustering(npis_corr, kmeans_corr.labels_, npi_indices_all) - - # corr_threshold = 0.5 - # corr_indices_threshold = np.where(npis_corr > corr_threshold) - # npis_corr_threshold = np.zeros(npis_corr.shape) - # npis_corr_threshold[corr_indices_threshold] = npis_corr[corr_indices_threshold] - # plt.imshow(npis_corr_threshold, cmap='gray_r') - - # plot share of counties that implement the main categories - if make_plot: - # plot four different subsets of curves for better distinction - j = 0 - if fine_resolution > 0: - num_images = 15 - else: - num_images = 1 - for i in [ - slice( - int(len(npi_codes_used) / num_images) * i, - min( - int(len(npi_codes_used) / num_images) * - (i + 1), - len(npis_used))) for i in range( - num_images + 1)]: - customPlot.plotList(df_npis_aggregated.index, - [df_npis_aggregated[code] - for code in npi_codes_used[i]], - npis_used[i], - 'Counties implementing NPI main categories', - 'Date', 'Number', "Counties_NPI_main_" + - str(j) + "_of_"+str(num_images)) - j += 1 - def main(): """! Main program entry.""" From ebe238dda18a32db9c1f9cab499a6b0f78216cfd Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 9 Jan 2023 14:25:46 +0100 Subject: [PATCH 039/104] rename transformNPIData -> getNPIData --- .../memilio/epidata/{transformNPIData.py => getNPIData.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pycode/memilio-epidata/memilio/epidata/{transformNPIData.py => getNPIData.py} (100%) diff --git a/pycode/memilio-epidata/memilio/epidata/transformNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py similarity index 100% rename from pycode/memilio-epidata/memilio/epidata/transformNPIData.py rename to pycode/memilio-epidata/memilio/epidata/getNPIData.py From 53270737f19fc7beb20188128d469f6f5018838a Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 10 Jan 2023 14:38:09 +0100 Subject: [PATCH 040/104] restructure and add first npi tests --- .../memilio/epidata/getNPIData.py | 238 ++++++++++-------- .../epidata_test/test_epidata_getNPIData.py | 61 +++++ 2 files changed, 189 insertions(+), 110 deletions(-) create mode 100644 pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 1bf9b65ea8..24bf5bae2b 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -108,63 +108,9 @@ def print_manual_download(filename, url): '. Then move it to a folder named raw_data in this directory.') -def transform_npi_data(fine_resolution=2, - file_format=dd.defaultDict['file_format'], - out_folder=dd.defaultDict['out_folder'], - start_date=dd.defaultDict['start_date'], - end_date=dd.defaultDict['end_date'], - counties_considered=geoger.get_county_ids() - ): - """! Loads a certain resolution of recorded NPI data from - the Corona Datenplattform and transforms it according to the - arguments given. 
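This hunk starts the announced restructuring: the raw-file input is pulled out of the large transformation function into a separate read_files helper (added further below), so the download and error path can be exercised in isolation. A minimal usage sketch with a made-up output folder; without the manually downloaded files the helper prints download instructions and re-raises:

    import os
    from memilio.epidata import getNPIData as gnd

    # Hypothetical folder; kr_massnahmen_unterkategorien.csv and
    # datensatzbeschreibung_massnahmen.xlsx would have to be placed here.
    directory = os.path.join('/tmp/npi_demo', 'Germany/')
    try:
        df_npis_old, df_npis_desc = gnd.read_files(directory, fine_resolution=2)
    except FileNotFoundError:
        print('Raw NPI files missing; follow the printed instructions.')
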
- - For full functionality, please manually download - - kr_massnahmen_unterkategorien.csv - - datensatzbeschreibung_massnahmen.xlsx - from https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise - and - - kr_massnahmen_oberkategorien.csv - from https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise - and move it to the *directory*-path mentioned in the beginning of the function. - - @param fine_resolution 2 [Default] or 0 or 1. Defines which categories - are considered. - If '2' is set, all the subcategories (~1200) are considered. - If '1' is set, all incidence levels of subcategories are merged and - ~200 NPIs are considered. - If '0' is chosen only the main, summarizing categories (~20) are used. - @param file_format File format which is used for writing the data. - Default defined in defaultDict. - @param out_folder Path to folder where data is written in folder - out_folder/Germany. - @param start_date [Default = '', taken from read data] Start date - of stored data frames. - @param end_date [Default = '', taken from read data] End date of - stored data frames. - @param make_plot False [Default] or True. Defines if plots are - generated with matplotlib. - @param moving_average 0 [Default] or Number>0. Defines the number of - days for which a centered moving average is computed. - """ - - directory = out_folder - directory = os.path.join(directory, 'Germany/') - gd.check_dir(directory) - +def read_files(directory, fine_resolution): + """Downloads files and stores data in dataframes named df_npis_old, df_npis_desc""" if fine_resolution > 0: - # defines delay in number of days between exceeding - # incidence threshold and NPI getting active - # delay = 0 means only one day is considered (=no delay) - npi_activation_delay = 2 - npi_lifting_delay = 4 # for NRW, BW - # 2 for bayern - # we use npi_lifting_delay = 4 as this is the most common - print('Using a delay of NPI activation of ' + - str(npi_activation_delay) + ' days.') - print('Using a delay of NPI lifting of ' + - str(npi_lifting_delay) + ' days.') - try: df_npis_old = pd.read_csv( os.path.join(directory, 'kr_massnahmen_unterkategorien.csv'), @@ -217,6 +163,127 @@ def transform_npi_data(fine_resolution=2, 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') raise FileNotFoundError + return df_npis_old, df_npis_desc + + +def activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activation_delay, incid_threshold): + """Warning: delay = 0 does NOT work!!!""" + # NPI can only be activated or liftet the day AFTER + # incidence is below/over threshold for N days. The + # incidence on day N only effects the NPI on day N+1 and + # NOT ON day N. Therefore we shift the incidence one day forward + # to match the indices of our dataframe df_local_new so that + # the NPIs can be calculated on the respective day. + # + # Example (threshold=3.5): + # local_incid=pd.Series([4,2,4,2,2,4,4,2,4,2,2,2,2]) + # Yesterdays incidence is over the threshold on following days: + # [?,1,0,1,0,0,1,1,0,1,0,0,0] + # The first day is not known and always set to the first days value. + # [1,1,0,1,0,0,1,1,0,1,0,0,0] + + # First get a Series with 0 for yesterdays incidence + # is below threshold and 1 for incidence over threshold + yesterdays_incid_over_threshold = (local_incid.shift( + 1).fillna(local_incid[0]) > incid_threshold).astype(int) # maybe >= + + # If incidence is above threshold for + # 1+npi_activation_delay days, the NPI gets activated. 
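This threshold rule can be replayed standalone; a minimal sketch in plain Python, using the same 0/1 series and delays (npi_lifting_delay=2, npi_activation_delay=1) as the worked example in the comments just below. List slices clamped at 0 stand in for the pandas slicing used in the actual loop:

    # 1 where yesterday's incidence exceeded the threshold, 0 otherwise.
    yesterday_over = [0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
    npi_lifting_delay, npi_activation_delay = 2, 1
    active = []
    for i in range(len(yesterday_over)):
        if sum(yesterday_over[max(0, i - npi_lifting_delay):i + 1]) == 0:
            active.append(0)   # below threshold long enough: lift NPI
        elif sum(yesterday_over[max(0, i - npi_activation_delay):i + 1]) \
                == npi_activation_delay + 1:
            active.append(1)   # above threshold long enough: activate NPI
        else:                  # otherwise keep the previous day's state
            active.append(active[-1] if i > 0 else 0)
    # NPI becomes active on day 8 and is lifted again on day 13 (1-based).
    assert active == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0]
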
+ # Similarly, if incidence is below threshold for + # 1+npi_lifting_delay days, the NPI is lifted. + # + # Example: + # With yesterdays incidence over threshold on days: + # [0,1,0,1,0,0,1,1,0,1,0,0,0] + # npi_lifting_delay=2, npi_activation_delay=1 + # NPI should be activated on day 8 and lifted on day 13 + # int_active should then be: + # [0,0,0,0,0,0,0,1,1,1,1,1,0] + # + # With yesterdays incidence over threshold on days: + # [1,1,0,1,0,0,1,1,0,1,0,0,0] (as above) + # NPI should be activated on day 2 and lifted on day 13 + # int_active should then be: + # [0,1,1,1,1,1,1,1,1,1,1,1,0] + + # get a zero filled Series with same length to be + # filled with ones where NPI is active + int_active = pd.Series(np.zeros(len(local_incid), dtype=int)) + # loop over every day + for i in range(len(yesterdays_incid_over_threshold)): + # Set int_active=0 where last npi_lifting_delay+1 days are 0 + if yesterdays_incid_over_threshold[i-npi_lifting_delay:i+1].values.sum() == 0: + int_active[i] = 0 + # Set int_active=1 where last npi_activation_delay+1 days are all 1 + elif yesterdays_incid_over_threshold[i-npi_activation_delay:i+1].values.sum() == npi_activation_delay+1: + int_active[i] = 1 + # If no condition applies, set int_active to the value of the previous day + else: + int_active[i] = int_active[i-1] + + return int_active + + +def get_npi_data(fine_resolution=2, + file_format=dd.defaultDict['file_format'], + out_folder=dd.defaultDict['out_folder'], + start_date=dd.defaultDict['start_date'], + end_date=dd.defaultDict['end_date'], + counties_considered=geoger.get_county_ids() + ): + """! Loads a certain resolution of recorded NPI data from + the Corona Datenplattform and transforms it according to the + arguments given. + + For full functionality, please manually download + - kr_massnahmen_unterkategorien.csv + - datensatzbeschreibung_massnahmen.xlsx + from https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise + and + - kr_massnahmen_oberkategorien.csv + from https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise + and move it to the *directory*-path mentioned in the beginning of the function. + + @param fine_resolution 2 [Default] or 0 or 1. Defines which categories + are considered. + If '2' is set, all the subcategories (~1200) are considered. + If '1' is set, all incidence levels of subcategories are merged and + ~200 NPIs are considered. + If '0' is chosen only the main, summarizing categories (~20) are used. + @param file_format File format which is used for writing the data. + Default defined in defaultDict. + @param out_folder Path to folder where data is written in folder + out_folder/Germany. + @param start_date [Default = '', taken from read data] Start date + of stored data frames. + @param end_date [Default = '', taken from read data] End date of + stored data frames. + @param make_plot False [Default] or True. Defines if plots are + generated with matplotlib. + @param moving_average 0 [Default] or Number>0. Defines the number of + days for which a centered moving average is computed. 
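(Purely illustrative calls for the three resolution levels described in this docstring; a sketch that assumes the raw files were downloaded manually as noted above, and the two county IDs are an arbitrary example:)

    from memilio.epidata import getNPIData as gnd

    gnd.get_npi_data(fine_resolution=2)  # all ~1200 subcategories
    gnd.get_npi_data(fine_resolution=1)  # incidence levels merged, ~200 NPIs
    # ~20 main categories only, restricted to two example counties
    gnd.get_npi_data(fine_resolution=0, counties_considered=[1001, 1002])
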
+ """ + + directory = out_folder + directory = os.path.join(directory, 'Germany/') + gd.check_dir(directory) + + if fine_resolution > 0: + # defines delay in number of days between exceeding + # incidence threshold and NPI getting active + # delay = 0 means only one day is considered (=no delay) + npi_activation_delay = 2 + npi_lifting_delay = 4 # for NRW, BW + # 2 for bayern + # we use npi_lifting_delay = 4 as this is the most common + print('Using a delay of NPI activation of ' + + str(npi_activation_delay) + ' days.') + print('Using a delay of NPI lifting of ' + + str(npi_lifting_delay) + ' days.') + + # read manual downloaded files from directory + df_npis_old, df_npis_desc = read_files(directory, fine_resolution) + # get existing codes that are used (in df_npis_old M22-M24 are empty) npi_codes_prior = df_npis_desc['Variablenname'] npi_codes_prior_desc = df_npis_desc['Variable'] @@ -674,58 +741,9 @@ def transform_npi_data(fine_resolution=2, for level, npi_indices in incidence_thresholds_to_npis.items(): if level[0] >= 0: # level[0] = incidvalthrsh local_incid = df_infec_local['Incidence'].copy() - # NPI can only be activated or liftet the day AFTER - # incidence is below/over threshold for N days. The - # incidence on day N only effects the NPI on day N+1 and - # NOT ON day N. Therefore we shift the incidence one day forward - # to match the indices of our dataframe df_local_new so that - # the NPIs can be calculated on the respective day. - # - # Example (threshold=3.5): - # local_incid=pd.Series([4,2,4,2,2,4,4,2,4,2,2,2,2]) - # Yesterdays incidence is over the threshold on following days: - # [?,1,0,1,0,0,1,1,0,1,0,0,0] - # The first day is not known and always set to the first days value. - # [1,1,0,1,0,0,1,1,0,1,0,0,0] - - # First get a Series with 0 for yesterdays incidence - # is below threshold and 1 for incidence over threshold - yesterdays_incid_over_threshold = (local_incid.shift( - 1).fillna(local_incid[0]) > level[0]).astype(int) - - # If incidence is above threshold for - # 1+npi_activation_delay days, the NPI gets activated. - # Similarly, if incidence is below threshold for - # 1+npi_lifting_delay days, the NPI is lifted. 
- # - # Example: - # With yesterdays incidence over threshold on days: - # [0,1,0,1,0,0,1,1,0,1,0,0,0] - # npi_lifting_delay=2, npi_activation_delay=1 - # NPI should be activated on day 8 and lifted on day 13 - # int_active should then be: - # [0,0,0,0,0,0,0,1,1,1,1,1,0] - # - # With yesterdays incidence over threshold on days: - # [1,1,0,1,0,0,1,1,0,1,0,0,0] (as above) - # NPI should be activated on day 2 and lifted on day 13 - # int_active should then be: - # [0,1,1,1,1,1,1,1,1,1,1,1,0] - - # get a zero filled Series with same length to be - # filled with ones where NPI is active - int_active = pd.Series(np.zeros(len(local_incid), dtype=int)) - # loop over every day - for i in range(len(yesterdays_incid_over_threshold)): - # Set int_active=0 where last npi_lifting_delay+1 days are 0 - if yesterdays_incid_over_threshold[i-npi_lifting_delay:i+1].values.sum() == 0: - int_active[i] = 0 - # Set int_active=1 where last npi_activation_delay+1 days are all 1 - elif yesterdays_incid_over_threshold[i-npi_activation_delay:i+1].values.sum() == npi_activation_delay+1: - int_active[i] = 1 - # If no condition applies, set int_active to the value of the previous day - else: - int_active[i] = int_active[i-1] + + # get days where npis are active as int (1/0) + int_active = activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activation_delay, level[0]) # multiply rows of data frame by either 1 if threshold # passed (i.e., mentioned NPI is active) or zero @@ -886,9 +904,9 @@ def main(): """! Main program entry.""" # arg_dict = gd.cli("testing") - transform_npi_data(fine_resolution=2) + get_npi_data(fine_resolution=2) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py new file mode 100644 index 0000000000..4772870aa1 --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -0,0 +1,61 @@ +###################################################################### +# Copyright (C) 2020-2021 German Aerospace Center (DLR-SC) +# +# Authors: +# +# Contact: Martin J. Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+###################################################################### +import unittest +from unittest.mock import patch, call +from pyfakefs import fake_filesystem_unittest + +import pandas as pd +import numpy as np + +from memilio.epidata import getDataIntoPandasDataFrame as gd +from memilio.epidata import getNPIData as gnd +from memilio.epidata import defaultDict as dd + + +class TestGetNPIData(fake_filesystem_unittest.TestCase): + maxDiff = None + + path = '/home/NPIData' + + incid_no_delay = pd.Series([1,2,3,4,1,1,0,5,0,1,2,3,5,1,0,2,4,5,6]) + test_series_no_delay = pd.Series([0,0,0,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1]) + + def setUp(self): + self.setUpPyfakefs() + + # first test of everything + # should not raise any errors + # to be deleted if enough tests are written + def test_get_npi_data(self, mockv): + gnd.get_npi_data(out_folder=self.path) + + def test_activate_npis_based_on_threshold(self): + threshold=1.5 + # test with delay = 1 ; int should be active two days after incid > 1.5 + # should start with 0 since first value of incid no delay = 1 < 1.5 + npi_activation_delay = 1 + npi_lifting_deay = 1 + int_active_no_delay = gnd.activate_npis_based_on_incidence( + self.incid_no_delay, npi_lifting_deay, npi_activation_delay, threshold) + self.assertEqual(int_active_no_delay.to_list(), self.test_series_no_delay.to_list()) + + +if __name__ == '__main__': + unittest.main() From 505e7d3b4b7e796df55c0cdf56733684665748ae Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 10 Jan 2023 18:08:05 +0100 Subject: [PATCH 041/104] add more tests --- .../memilio/epidata/getNPIData.py | 143 +++++++------- .../epidata_test/test_epidata_getNPIData.py | 174 ++++++++++++++++-- 2 files changed, 234 insertions(+), 83 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 24bf5bae2b..9adc9b9861 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -224,6 +224,70 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activat return int_active +def drop_codes_and_categories(npi_codes_prior, npi_codes_prior_desc, df_npis_old, fine_resolution): + if fine_resolution > 0: + # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} + # (M04_1, i.e. 
i=1, has been corrected in original file but not for i>1) + for i in range(2, 6): + npi_codes_prior[npi_codes_prior == 'M04_'+str(i)] = ['M04_120_'+str( + i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] + + # correct M05_N codes to M_05_M_N, N in {1,...,5}, M in {130,150,120,140,110,100,160} + for i in range(1, 6): + npi_codes_prior[npi_codes_prior == 'M05_'+str(i)] = ['M05_130_'+str(i), 'M05_150_'+str( + i), 'M05_120_'+str(i), 'M05_140_'+str(i), 'M05_110_'+str(i), 'M05_100_'+str(i), 'M05_160_'+str(i)] + + # correct 'M16_200_2' to missing 'M16_100_2' + npi_codes_prior[npi_codes_prior == 'M16_200_2'] = 'M16_100_2' + + # check for missing codes + npi_codes_prior_data = df_npis_old[dd.EngEng['npiCode']].unique() + + missing_codes = list(set(npi_codes_prior).difference( + npi_codes_prior_data)) + if len(missing_codes) > 0: + # if incidence is grouped, only search for grouping codes without + # having a detailed "_DETAIL" naming as of MCODE_NUMBER_DETAIL + if fine_resolution == 1: + missing_grouped_codes = [] + for mcode in missing_codes: + # only consider incidence independent npis + # only exit if one of these (i.e., MCODE_NUMBER) is missing + if len(mcode.split('_')) != 3: + missing_grouped_codes.append(mcode) + if len(missing_grouped_codes) > 0: # only MCODE_NUMBER codes + raise gd.DataError('Missing NPI codes: ' + + str(missing_grouped_codes)) + else: + raise gd.DataError('Missing NPI codes: ' + str(missing_codes)) + + # we dont have any explanations from "datensatzbeschreibung_massnahmen" + # on these codes, so drop the rows. + codes_dropped = list(set(npi_codes_prior_data).difference( + npi_codes_prior)) + # also remove dummy 'Platzhalter' categories + dummy_categories = [] + for i in range(len(npi_codes_prior)): + if 'Platzhalter' in npi_codes_prior_desc[i]: + dummy_categories.append(npi_codes_prior[i]) + # codes without explanation and dummy categories + # sorting done for consistenty, maybe not necessary + codes_dropped = list(np.sort(codes_dropped + dummy_categories)) + if len(codes_dropped) > 0: + df_npis_old = df_npis_old[~df_npis_old[dd.EngEng['npiCode']].isin( + codes_dropped)].reset_index(drop=True) + # for every main code removed, all 5 subcodes have to be removed; + # if this is not the case, the naming of them is wrong/not consistent + if (len(codes_dropped) % 6) != 0: + raise gd.DataError('Error in NPI names, please check.') + else: + # no dropping for fine_resolution == 0 + codes_dropped=[] + + return codes_dropped, npi_codes_prior, df_npis_old + + + def get_npi_data(fine_resolution=2, file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], @@ -272,10 +336,13 @@ def get_npi_data(fine_resolution=2, # defines delay in number of days between exceeding # incidence threshold and NPI getting active # delay = 0 means only one day is considered (=no delay) - npi_activation_delay = 2 - npi_lifting_delay = 4 # for NRW, BW - # 2 for bayern - # we use npi_lifting_delay = 4 as this is the most common + npi_activation_delay = 3 + npi_lifting_delay = 5 + # depending on the federal state and time period, there are + # huge deviations of the lifting and activation delay which was usually + # between 1 and 14 days + # we use npi_lifting_delay = 5 and npi_activation_delay = 3 + # as this is the most common and has at some point been used in almost every county print('Using a delay of NPI activation of ' + str(npi_activation_delay) + ' days.') print('Using a delay of NPI lifting of ' + @@ -392,62 +459,8 @@ def 
get_npi_data(fine_resolution=2, writer.save() # correct differences in codes between data sheet and explanation sheet - codes_dropped = [] # no dropping for fine_resolution == 0 - if fine_resolution > 0: - # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} - # (M04_1, i.e. i=1, has been corrected in original file but not for i>1) - for i in range(2, 6): - npi_codes_prior[npi_codes_prior == 'M04_'+str(i)] = ['M04_120_'+str( - i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] - - # correct M05_N codes to M_05_M_N, N in {1,...,5}, M in {130,150,120,140,110,100,160} - for i in range(1, 6): - npi_codes_prior[npi_codes_prior == 'M05_'+str(i)] = ['M05_130_'+str(i), 'M05_150_'+str( - i), 'M05_120_'+str(i), 'M05_140_'+str(i), 'M05_110_'+str(i), 'M05_100_'+str(i), 'M05_160_'+str(i)] - - # correct 'M16_200_2' to missing 'M16_100_2' - npi_codes_prior[npi_codes_prior == 'M16_200_2'] = 'M16_100_2' - - # check for missing codes - npi_codes_prior_data = df_npis_old[dd.EngEng['npiCode']].unique() - - missing_codes = list(set(npi_codes_prior).difference( - npi_codes_prior_data)) - if len(missing_codes) > 0: - # if incidence is grouped, only search for grouping codes without - # having a detailed "_DETAIL" naming as of MCODE_NUMBER_DETAIL - if fine_resolution == 1: - missing_grouped_codes = [] - for mcode in missing_codes: - # only consider incidence independent npis - # only exit if one of these (i.e., MCODE_NUMBER) is missing - if len(mcode.split('_')) != 3: - missing_grouped_codes.append(mcode) - if len(missing_grouped_codes) > 0: # only MCODE_NUMBER codes - sys.exit('Missing NPI codes: ' + - str(missing_grouped_codes)) - else: - sys.exit('Missing NPI codes: ' + str(missing_codes)) - - # we dont have any explanations from "datensatzbeschreibung_massnahmen" - # on these codes, so drop the rows. - codes_dropped = list(set(npi_codes_prior_data).difference( - npi_codes_prior)) - # also remove dummy 'Platzhalter' categories - dummy_categories = [] - for i in range(len(npi_codes_prior)): - if 'Platzhalter' in npi_codes_prior_desc[i]: - dummy_categories.append(npi_codes_prior[i]) - # codes without explanation and dummy categories - # sorting done for consistenty, maybe not necessary - codes_dropped = list(np.sort(codes_dropped + dummy_categories)) - if len(codes_dropped) > 0: - df_npis_old = df_npis_old[~df_npis_old[dd.EngEng['npiCode']].isin( - codes_dropped)].reset_index(drop=True) - # for every main code removed, all 5 subcodes have to be removed; - # if this is not the case, the naming of them is wrong/not consistent - if (len(codes_dropped) % 6) != 0: - sys.exit('Error in NPI names, please check.') + codes_dropped, npi_codes_prior, df_npis_old = drop_codes_and_categories( + npi_codes_prior, npi_codes_prior_desc, df_npis_old, fine_resolution) # sort NPI codes according to numeric values (argsort gives indices # in input list to be used for sorted array) @@ -522,7 +535,7 @@ def get_npi_data(fine_resolution=2, npi_codes_aggregated = [] for main_code in maincode_to_npicodes_map.keys(): if main_code.count('_') > 1: - sys.exit('Error. Subcode assigned as main code.') + raise gd.DataError('Error. Subcode assigned as main code.') npi_codes_aggregated.append(main_code) npis_final = npis[npis[dd.EngEng['npiCode']].isin( @@ -542,7 +555,7 @@ def get_npi_data(fine_resolution=2, incid_threshold = int( npis.loc[i, dd.EngEng['desc']].split(' ')[1]) else: - sys.exit( + raise gd.DataError( 'Error in description file. NPI activation can not ' 'be computed. 
Exiting.') npi_incid_start[npis.loc[i, dd.EngEng['npiCode']] @@ -560,7 +573,7 @@ def get_npi_data(fine_resolution=2, (threshold, '_' + code.split('_')[2])) for i in range(len(incidence_thresholds)-1): if incidence_thresholds[i][0] > incidence_thresholds[i+1][0]: - sys.exit('List needs to be sorted.') + raise gd.DataError('List needs to be sorted.') # create hash map from thresholds to NPI indices incidence_thresholds_to_npis = dict( @@ -580,7 +593,7 @@ def get_npi_data(fine_resolution=2, ~df_npis_old[dd.EngEng['idCounty']].isin(counties_considered)][ dd.EngEng['idCounty']].unique() if list(counties_removed) != [16056]: - sys.exit('Error. Other counties than that of Eisenach were removed.') + raise gd.DataError('Error. Other counties than that of Eisenach were removed.') # remove rows for Eisenach df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( counties_considered)].reset_index(drop=True) @@ -606,7 +619,7 @@ def get_npi_data(fine_resolution=2, print( "\t - From " + str(dates_new[i] + timedelta(1)) + " until " + str(dates_new[i] + timedelta(date_diff[i] - 1))) - sys.exit('Exiting. Dates missing in data frame.') + raise gd.DataError('Exiting. Dates missing in data frame.') min_date = [] max_date = [] diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index 4772870aa1..cb01c0eefd 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -18,43 +18,181 @@ # limitations under the License. ###################################################################### import unittest -from unittest.mock import patch, call from pyfakefs import fake_filesystem_unittest import pandas as pd -import numpy as np -from memilio.epidata import getDataIntoPandasDataFrame as gd from memilio.epidata import getNPIData as gnd from memilio.epidata import defaultDict as dd - +from memilio.epidata import getDataIntoPandasDataFrame as gd class TestGetNPIData(fake_filesystem_unittest.TestCase): maxDiff = None path = '/home/NPIData' - incid_no_delay = pd.Series([1,2,3,4,1,1,0,5,0,1,2,3,5,1,0,2,4,5,6]) - test_series_no_delay = pd.Series([0,0,0,1,1,1,0,0,0,0,0,0,1,1,1,0,0,1,1]) + incid = pd.Series( + [1, 2, 3, 4, 1, 1, 0, 5, 0, 1, 2, 3, 5, 1, 0, 2, 4, 5, 6]) + active = pd.Series( + [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1]) + + incid_start_above_threshold = pd.Series( + [2, 2, 3, 4, 1, 1, 0, 5, 0, 1, 2, 3, 5, 1, 0, 2, 4, 5, 6]) + active_start_above_threshold = pd.Series( + [0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1]) + + codes_to_correct = [ # should be renamed: + 'M04_2', 'M04_2', 'M04_2', 'M04_2', 'M04_2', + 'M04_3', 'M04_3', 'M04_3', 'M04_3', 'M04_3', + 'M04_4', 'M04_4', 'M04_4', 'M04_4', 'M04_4', + 'M04_5', 'M04_5', 'M04_5', 'M04_5', 'M04_5', + 'M05_1', 'M05_1', 'M05_1', 'M05_1', 'M05_1', 'M05_1', 'M05_1', + 'M05_2', 'M05_2', 'M05_2', 'M05_2', 'M05_2', 'M05_2', 'M05_2', + 'M05_3', 'M05_3', 'M05_3', 'M05_3', 'M05_3', 'M05_3', 'M05_3', + 'M05_4', 'M05_4', 'M05_4', 'M05_4', 'M05_4', 'M05_4', 'M05_4', + 'M05_5', 'M05_5', 'M05_5', 'M05_5', 'M05_5', 'M05_5', 'M05_5', + 'M16_200_2', + # should not be renamed/dropped: + 'M04_120', 'M04_110', 'M04_100', 'M04_130', 'M04_140', + 'M01_100', 'M01_110', 'M01_120', 'M01_130', 'M01_140', + 'M01_100_1', 'M01_110_1', 'M01_120_1', 'M01_130_1', 'M01_140_1', + 'M01_100_2', 'M01_110_2', 'M01_120_2', 'M01_130_2', 'M01_140_2', + 
'M01_100_3', 'M01_110_3', 'M01_120_3', 'M01_130_3', 'M01_140_3', + 'M01_100_4', 'M01_110_4', 'M01_120_4', 'M01_130_4', 'M01_140_4', + 'M01_100_5', 'M01_110_5', 'M01_120_5', 'M01_130_5', 'M01_140_5', + ] + + corrected_codes = [ # renamed codes: + 'M04_120_2', 'M04_110_2', 'M04_100_2', 'M04_130_2', 'M04_140_2', + 'M04_120_3', 'M04_110_3', 'M04_100_3', 'M04_130_3', 'M04_140_3', + 'M04_120_4', 'M04_110_4', 'M04_100_4', 'M04_130_4', 'M04_140_4', + 'M04_120_5', 'M04_110_5', 'M04_100_5', 'M04_130_5', 'M04_140_5', + 'M05_130_1', 'M05_150_1', 'M05_120_1', 'M05_140_1', 'M05_110_1', + 'M05_100_1', 'M05_160_1', 'M05_130_2', 'M05_150_2', 'M05_120_2', + 'M05_140_2', 'M05_110_2', 'M05_100_2', 'M05_160_2', 'M05_130_3', + 'M05_150_3', 'M05_120_3', 'M05_140_3', 'M05_110_3', 'M05_100_3', + 'M05_160_3', 'M05_130_4', 'M05_150_4', 'M05_120_4', 'M05_140_4', + 'M05_110_4', 'M05_100_4', 'M05_160_4', 'M05_130_5', 'M05_150_5', + 'M05_120_5', 'M05_140_5', 'M05_110_5', 'M05_100_5', 'M05_160_5', + 'M16_100_2', + # not changed codes: + 'M04_120', 'M04_110', 'M04_100', 'M04_130', 'M04_140', + 'M01_100', 'M01_110', 'M01_120', 'M01_130', 'M01_140', + 'M01_100_1', 'M01_110_1', 'M01_120_1', 'M01_130_1', 'M01_140_1', + 'M01_100_2', 'M01_110_2', 'M01_120_2', 'M01_130_2', 'M01_140_2', + 'M01_100_3', 'M01_110_3', 'M01_120_3', 'M01_130_3', 'M01_140_3', + 'M01_100_4', 'M01_110_4', 'M01_120_4', 'M01_130_4', 'M01_140_4', + 'M01_100_5', 'M01_110_5', 'M01_120_5', 'M01_130_5', 'M01_140_5', ] + + codes_to_drop = ['M02_120', 'M02_120_1', 'M02_120_2', + 'M02_120_3', 'M02_120_4', 'M02_120_5', + 'M07_100', 'M07_100_1', 'M07_100_2', + 'M07_100_3', 'M07_100_4', 'M07_100_5'] + + missing_codes = ['M02_120_1', 'M02_120_2', + 'M02_120_3', 'M02_120_4', 'M02_120_5', + 'M07_100_1', 'M07_100_2', + 'M07_100_3', 'M07_100_4', 'M07_100_5'] def setUp(self): self.setUpPyfakefs() - # first test of everything - # should not raise any errors - # to be deleted if enough tests are written - def test_get_npi_data(self, mockv): - gnd.get_npi_data(out_folder=self.path) - def test_activate_npis_based_on_threshold(self): - threshold=1.5 + threshold = 1.5 # test with delay = 1 ; int should be active two days after incid > 1.5 - # should start with 0 since first value of incid no delay = 1 < 1.5 + # should start with 0 since first value of incid_delay_1 = 1 < 1.5 npi_activation_delay = 1 - npi_lifting_deay = 1 - int_active_no_delay = gnd.activate_npis_based_on_incidence( - self.incid_no_delay, npi_lifting_deay, npi_activation_delay, threshold) - self.assertEqual(int_active_no_delay.to_list(), self.test_series_no_delay.to_list()) + npi_lifting_delay = 1 + int_active = gnd.activate_npis_based_on_incidence( + self.incid, npi_lifting_delay, npi_activation_delay, threshold) + self.assertEqual( + int_active.to_list(), + self.active.to_list()) + + # test same data set with first value of incid_delay_1 = 2 > 1.5 + int_active_start_above_threshold = gnd.activate_npis_based_on_incidence( + self.incid_start_above_threshold, npi_lifting_delay, + npi_activation_delay, threshold) + self.assertEqual( + int_active_start_above_threshold.to_list(), + self.active_start_above_threshold.to_list()) + + # TODO maybe test bigger data sets and npi_activation_delay != npi_lifting_delay but both >= 3 + + def test_drop_codes_and_categories(self): + # test with no dropped codes or categories + # just rename + fine_resolution = 2 + npi_codes_prior_test = pd.Series(self.codes_to_correct) + # create dataframe with corrected columns + # just one column, no data + df_npis_old_test = pd.DataFrame( + 
columns=[dd.EngEng['npiCode']], + data=self.corrected_codes) + # for this test no categories should be removed + # no 'Platzhalter' in the description + npi_codes_prior_desc_test = pd.Series( + ['-' for i in range(len(self.codes_to_correct))]) + # work with copies to not change the original data + codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( + npi_codes_prior_test.copy(), npi_codes_prior_desc_test.copy(), df_npis_old_test.copy(), fine_resolution) + # no codes should be dropped + self.assertEqual(codes_dropped, []) + # codes should now be corrected + self.assertEqual(npi_codes_prior.to_list(), self.corrected_codes) + # dataframe should not have changed + pd.testing.assert_frame_equal(df_npis_old, df_npis_old_test) + + # now test with codes dropped + # npi_codes_prior and npi_codes_prior_desc same as above + # only some codes added to df_npis_old which should be removed. + df_npis_old_test_cd = pd.DataFrame( + columns=[dd.EngEng['npiCode']], + data=self.corrected_codes+self.codes_to_drop) + # work with copies to not change the original data + codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( + npi_codes_prior_test.copy(), npi_codes_prior_desc_test.copy(), df_npis_old_test_cd.copy(), fine_resolution) + # codes should be dropped + self.assertEqual(codes_dropped, self.codes_to_drop) + # codes should now be corrected as above + self.assertEqual(npi_codes_prior.to_list(), self.corrected_codes) + # dataframe should have changed (expect AssertionError) + with self.assertRaises(AssertionError): + pd.testing.assert_frame_equal(df_npis_old, df_npis_old_test_cd) + + # test handling of missing codes + # more codes in npi_codes_prior than in df + npi_codes_prior_test_mc = pd.Series(self.codes_to_correct + self.missing_codes) + # create dataframe with corrected columns + # just one column, no data + df_npis_old_test = pd.DataFrame( + columns=[dd.EngEng['npiCode']], + data=self.corrected_codes) + # for this test no categories should be removed + # no 'Platzhalter' in the description + npi_codes_prior_desc_test_mc = pd.Series( + ['-' for i in range(len(self.codes_to_correct+ self.missing_codes))]) + + # with fine_resolution = 2 every missing code should raise a DataError + with self.assertRaises(gd.DataError): + codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( + npi_codes_prior_test_mc.copy(), npi_codes_prior_desc_test_mc.copy(), df_npis_old_test.copy(), fine_resolution) + + # fine_resolution = 1 should handle missing codes and + fine_resolution=1 + codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( + npi_codes_prior_test_mc.copy(), npi_codes_prior_desc_test_mc.copy(), df_npis_old_test.copy(), fine_resolution) + # no codes should be dropped + self.assertEqual(codes_dropped, []) + # codes should now be corrected + self.assertEqual( + npi_codes_prior.to_list(), + self.corrected_codes + self.missing_codes) + self.assertNotEqual(npi_codes_prior.to_list(), self.corrected_codes) + # dataframe should not have changed + pd.testing.assert_frame_equal(df_npis_old, df_npis_old_test) + + # TODO test Platzhalter if __name__ == '__main__': From 1453418e0fe042a1fb1c47351ca4715ad9ae935f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Wed, 11 Jan 2023 11:20:14 +0100 Subject: [PATCH 042/104] correction for activation and comments --- .../memilio/epidata/getNPIData.py | 127 +++++++++++++----- 1 file changed, 97 insertions(+), 30 deletions(-) diff --git 
a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 9adc9b9861..09446b7f11 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -39,6 +39,17 @@ def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, """! Validates the transformed NPI data based on read in NPI data list. Also works for incidence-dependent NPIs as long as no activation or lifting delay is used. + @param df_npis_old Original data frame. + @param df_npis New NPI data frame with potential activation of incidence- + dependent NPIs. + @param df_infec_rki Case data for incidence comparison. + @param countyID CountyID of county to be validated. + @@param npiCode NPI Code of code to be validated. + @param start_npi_cols Index of column where NPI information start in + original data frame. + @param npi_incid_start Minimum incidence for activation of NPI codes. + @param start_date_validation Start date for validation. + @param end_date_validation End date for validation. """ if fine_resolution == 1: @@ -101,6 +112,7 @@ def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, def print_manual_download(filename, url): + """! Print download message to ask the user manually download a file. """ print( 'This script needs manual downloading of files. Please register' @@ -109,7 +121,18 @@ def print_manual_download(filename, url): def read_files(directory, fine_resolution): - """Downloads files and stores data in dataframes named df_npis_old, df_npis_desc""" + """! Reads files from local directory and returns data in dataframes + + @param directory Directory where data is loaded from. + @param fine_resolution 2 [Default] or 0 or 1. Defines which categories + are considered. + If '2' is set, all the subcategories (~1200) are considered. + If '1' is set, all incidence levels of subcategories are merged and + ~200 NPIs are considered. + If '0' is chosen only the main, summarizing categories (~20) are used. + @return Data frames df_npis_old (Decreed, encoded NPIs for all German + counties) and df_npis_desc (Description of NPIs) + """ if fine_resolution > 0: try: df_npis_old = pd.read_csv( @@ -167,20 +190,37 @@ def read_files(directory, fine_resolution): def activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activation_delay, incid_threshold): - """Warning: delay = 0 does NOT work!!!""" - # NPI can only be activated or liftet the day AFTER - # incidence is below/over threshold for N days. The - # incidence on day N only effects the NPI on day N+1 and - # NOT ON day N. Therefore we shift the incidence one day forward - # to match the indices of our dataframe df_local_new so that - # the NPIs can be calculated on the respective day. + """! + Warning: delay = 0 does NOT work!!! + Computes an activation vector according to a given incidence threshold, + observed incidence and activation or lifting delays. + + NPI can only be activated or lifted the day AFTER + incidence is below/over threshold for N days. The + incidence on day N only effects the NPI on day N+1 and + NOT ON day N. Therefore we shift the incidence one day forward + to match the indices of our dataframe df_local_new so that + the NPIs can be calculated on the respective day. + + Please also note that the first column will always returned as false + so the dataframe should not start with dates where NPIs are implemented. 
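The shift just described is plain pandas; a minimal sketch with made-up numbers and the threshold 3.5 used in the docstring examples:

    import pandas as pd

    local_incid = pd.Series([2, 4, 2, 4])
    # Day 0 has no known yesterday, so it reuses day 0's own value.
    yesterday = local_incid.shift(1).fillna(local_incid[0])
    print(yesterday.tolist())                      # [2.0, 2.0, 4.0, 2.0]
    print((yesterday > 3.5).astype(int).tolist())  # [0, 0, 1, 0]
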
+ For the Corona Datenplattform frame which starts from 2020-03-01 + this is no problem for the first days as there were no NPIs. + @param local_incid Incidence observed in local region. + @param npi_lifting_delay Delay in number of days when to lift a NPI if + threshold is no longer exceeded. + @param npi_activation_delay Delay in number of days when to implement a + NPI if threshold is exceeded. + @param incid_threshold Threshold to be considered. + """ # # Example (threshold=3.5): - # local_incid=pd.Series([4,2,4,2,2,4,4,2,4,2,2,2,2]) + # local_incid=pd.Series([2,4,2,4,2,2,4,4,2,4,2,2,2,2]) # Yesterdays incidence is over the threshold on following days: - # [?,1,0,1,0,0,1,1,0,1,0,0,0] - # The first day is not known and always set to the first days value. - # [1,1,0,1,0,0,1,1,0,1,0,0,0] + # [?,0,1,0,1,0,0,1,1,0,1,0,0,0] + # Yesterday for first day is not known. Thus, first day's boolean + # is always set to the same boolean as second day's boolean. + # [0,0,1,0,1,0,0,1,1,0,1,0,0,0] # First get a Series with 0 for yesterdays incidence # is below threshold and 1 for incidence over threshold @@ -192,19 +232,21 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activat # Similarly, if incidence is below threshold for # 1+npi_lifting_delay days, the NPI is lifted. # - # Example: + # Example (continued): # With yesterdays incidence over threshold on days: - # [0,1,0,1,0,0,1,1,0,1,0,0,0] + # [0,0,1,0,1,0,0,1,1,0,1,0,0,0] # npi_lifting_delay=2, npi_activation_delay=1 - # NPI should be activated on day 8 and lifted on day 13 + # NPI should be activated on day 9 and lifted on day 14 # int_active should then be: - # [0,0,0,0,0,0,0,1,1,1,1,1,0] + # [0,0,0,0,0,0,0,0,1,1,1,1,1,0] # - # With yesterdays incidence over threshold on days: - # [1,1,0,1,0,0,1,1,0,1,0,0,0] (as above) - # NPI should be activated on day 2 and lifted on day 13 + # Another example: + # With yesterday's incidence over threshold on days: + # [1,1,0,1,0,0,1,1,0,1,0,0,0,0] + # npi_lifting_delay=3, npi_activation_delay=1 + # NPI should be activated on day 2 and lifted on day 14 # int_active should then be: - # [0,1,1,1,1,1,1,1,1,1,1,1,0] + # [0,1,1,1,1,1,1,1,1,1,1,1,1,0] # get a zero filled Series with same length to be # filled with ones where NPI is active @@ -212,19 +254,34 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activat # loop over every day for i in range(len(yesterdays_incid_over_threshold)): # Set int_active=0 where last npi_lifting_delay+1 days are 0 - if yesterdays_incid_over_threshold[i-npi_lifting_delay:i+1].values.sum() == 0: + if yesterdays_incid_over_threshold[max(0,i-npi_lifting_delay):i+1].values.sum() == 0: int_active[i] = 0 # Set int_active=1 where last npi_activation_delay+1 days are all 1 - elif yesterdays_incid_over_threshold[i-npi_activation_delay:i+1].values.sum() == npi_activation_delay+1: + elif yesterdays_incid_over_threshold[max(0,i-npi_activation_delay):i+1].values.sum() == npi_activation_delay+1: int_active[i] = 1 # If no condition applies, set int_active to the value of the previous day - else: + elif i>0: # if int_active[i] = int_active[i-1] return int_active def drop_codes_and_categories(npi_codes_prior, npi_codes_prior_desc, df_npis_old, fine_resolution): + """! Drops codes and categories from original data frame if they are not + used. + + @param npi_codes_prior NPI codes read from description sheet. + @param npi_codes_prior_desc NPI code descriptions read from description sheet. 
+ @param df_npis_old Original data frame with encoding (decreed, yes or no) + for different NPIs + @param fine_resolution 2 [Default] or 0 or 1. Defines which categories + are considered. + If '2' is set, all the subcategories (~1200) are considered. + If '1' is set, all incidence levels of subcategories are merged and + ~200 NPIs are considered. + If '0' is chosen only the main, summarizing categories (~20) are used. + @return Returns dropped codes, prior codes and reduced original data frame. + """ if fine_resolution > 0: # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} # (M04_1, i.e. i=1, has been corrected in original file but not for i>1) @@ -296,10 +353,19 @@ def get_npi_data(fine_resolution=2, counties_considered=geoger.get_county_ids() ): """! Loads a certain resolution of recorded NPI data from - the Corona Datenplattform and transforms it according to the - arguments given. + the Corona Datenplattform and extracts the counties asked for and + activates the NPIs if they are incidence dependent. + + Results' data frames will be stored in the directory as: + -fine_resolution=2: germany_counties_npi_subcat + -fine_resolution=1: germany_counties_npi_subcat_incgrouped + -fine_resolution=0: germany_counties_npi_maincat - For full functionality, please manually download + Needs the files 'cases_all_county_all_dates_repdate.json' and + 'county_current_population.json' which can be created by the functions + getCasesData.py (with argument --rep-date) and getPopulationData.py. + + Please manually download - kr_massnahmen_unterkategorien.csv - datensatzbeschreibung_massnahmen.xlsx from https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise @@ -322,12 +388,13 @@ def get_npi_data(fine_resolution=2, of stored data frames. @param end_date [Default = '', taken from read data] End date of stored data frames. - @param make_plot False [Default] or True. Defines if plots are - generated with matplotlib. - @param moving_average 0 [Default] or Number>0. Defines the number of - days for which a centered moving average is computed. + @param counties_considered [Default: 'All']. Either 'All' or a list of + county IDs from 1001 to 16xxx. """ + if counties_considered == 'All': + counties_considered = geoger.get_county_ids() + directory = out_folder directory = os.path.join(directory, 'Germany/') gd.check_dir(directory) From 7f0eff855f73d4d3a3a602d29ab82ecdc2aa56fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Wed, 11 Jan 2023 15:52:10 +0100 Subject: [PATCH 043/104] some corrections and test --- .../memilio/epidata/getNPIData.py | 117 ++++++++++-------- .../epidata_test/test_epidata_getNPIData.py | 87 ++++++++++--- 2 files changed, 132 insertions(+), 72 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 09446b7f11..82b2b6657f 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -189,78 +189,85 @@ def read_files(directory, fine_resolution): return df_npis_old, df_npis_desc -def activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activation_delay, incid_threshold): +def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, incid_threshold): """! - Warning: delay = 0 does NOT work!!! Computes an activation vector according to a given incidence threshold, observed incidence and activation or lifting delays. 
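The rewritten docstring below works through concrete series; as a cross-check, its first example can be replayed against the function itself. A sketch assuming the memilio package is importable:

    import pandas as pd
    from memilio.epidata import getNPIData as gnd

    local_incid = pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
    int_active = gnd.activate_npis_based_on_incidence(
        local_incid, npi_lifting_days_threshold=2,
        npi_activation_days_threshold=1, incid_threshold=3.5)
    # Activation on days 4 and 9, lifting on days 8 and 14 (1-based),
    # matching the docstring example.
    assert int_active.tolist() == [0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0]
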
-    NPI can only be activated or lifted the day AFTER
-    incidence is below/over threshold for N days. The
-    incidence on day N only effects the NPI on day N+1 and
-    NOT ON day N. Therefore we shift the incidence one day forward
-    to match the indices of our dataframe df_local_new so that
-    the NPIs can be calculated on the respective day.
+    In order for incidence-dependent NPIs to become active, the incidence
+    has to exceed the threshold for npi_activation_days_threshold
+    consecutive days. For a formerly active NPI to be lifted, the incidence
+    has to be below the threshold for npi_lifting_days_threshold
+    consecutive days.
+
+    If one of these criteria is satisfied, the activation or lifting
+    happens on the following day. This is in accordance with case
+    reporting, which can only happen once a day has ended; as these
+    reports generally appeared in the morning, the NPI cannot be activated
+    or lifted on that same day. Please see the examples for a better
+    understanding.
+
+    Example (threshold=3.5):
+    local_incid=pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
+    Yesterday's incidence is over the threshold on the following days:
+    [?, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
+
+    Yesterday's value for the first day is not known. Thus, the first
+    day's boolean is always set to the same boolean as the second day's.
+    [0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
+
+    With yesterday's incidence over threshold on days:
+    [0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
+    and npi_lifting_days_threshold=2, npi_activation_days_threshold=1,
+    the NPI should be activated on days 4 and 9 and lifted on days 8 and 14.
+    int_active should then be:
+    [0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0]
+    With npi_lifting_days_threshold=3, npi_activation_days_threshold=2,
+    the NPI should be activated on day 9 (and lifted on day 15; not in the
+    vector):
+    [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+
+    Another example:
+    With yesterday's incidence over threshold on days:
+    [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]
+    and npi_lifting_days_threshold=3, npi_activation_days_threshold=1,
+    the NPI should be activated on day 2 and lifted on day 14.
+    int_active should then be:
+    [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
 
     Please also note that the first column will always returned as false
     so the dataframe should not start with dates where NPIs are
     implemented.
     For the Corona Datenplattform frame which starts from 2020-03-01
     this is no problem for the first days as there were no NPIs.
     @param local_incid Incidence observed in local region.
-    @param npi_lifting_delay Delay in number of days when to lift a NPI if
-    threshold is no longer exceeded.
-    @param npi_activation_delay Delay in number of days when to implement a
-    NPI if threshold is exceeded.
+    @param npi_lifting_days_threshold Number of days for which the incidence
+    has to be lower than the threshold to lift the NPI.
+    @param npi_activation_days_threshold Number of days for which the
+    incidence threshold has to be exceeded before activation of the NPI.
     @param incid_threshold Threshold to be considered.
     """
-    #
-    # Example (threshold=3.5):
-    # local_incid=pd.Series([2,4,2,4,2,2,4,4,2,4,2,2,2,2])
-    # Yesterdays incidence is over the threshold on following days:
-    # [?,0,1,0,1,0,0,1,1,0,1,0,0,0]
-    # Yesterday for first day is not known. Thus, first day's boolean
-    # is always set to the same boolean as second day's boolean.
- # [0,0,1,0,1,0,0,1,1,0,1,0,0,0] - + + if npi_lifting_days_threshold < 1 or npi_activation_days_threshold < 1: + raise ValueError('Activation or lifting day variables need to be 1 or larger') + # First get a Series with 0 for yesterdays incidence # is below threshold and 1 for incidence over threshold yesterdays_incid_over_threshold = (local_incid.shift( - 1).fillna(local_incid[0]) > incid_threshold).astype(int) # maybe >= - - # If incidence is above threshold for - # 1+npi_activation_delay days, the NPI gets activated. - # Similarly, if incidence is below threshold for - # 1+npi_lifting_delay days, the NPI is lifted. - # - # Example (continued): - # With yesterdays incidence over threshold on days: - # [0,0,1,0,1,0,0,1,1,0,1,0,0,0] - # npi_lifting_delay=2, npi_activation_delay=1 - # NPI should be activated on day 9 and lifted on day 14 - # int_active should then be: - # [0,0,0,0,0,0,0,0,1,1,1,1,1,0] - # - # Another example: - # With yesterday's incidence over threshold on days: - # [1,1,0,1,0,0,1,1,0,1,0,0,0,0] - # npi_lifting_delay=3, npi_activation_delay=1 - # NPI should be activated on day 2 and lifted on day 14 - # int_active should then be: - # [0,1,1,1,1,1,1,1,1,1,1,1,1,0] + 1).fillna(local_incid[0]) > incid_threshold).astype(int) # get a zero filled Series with same length to be # filled with ones where NPI is active int_active = pd.Series(np.zeros(len(local_incid), dtype=int)) # loop over every day for i in range(len(yesterdays_incid_over_threshold)): - # Set int_active=0 where last npi_lifting_delay+1 days are 0 - if yesterdays_incid_over_threshold[max(0,i-npi_lifting_delay):i+1].values.sum() == 0: + # Set int_active=0 where last npi_lifting_days_threshold+1 days did not exceed + # the threshold + if yesterdays_incid_over_threshold[max(0,i-npi_lifting_days_threshold):i].values.sum() == 0: int_active[i] = 0 - # Set int_active=1 where last npi_activation_delay+1 days are all 1 - elif yesterdays_incid_over_threshold[max(0,i-npi_activation_delay):i+1].values.sum() == npi_activation_delay+1: + # Set int_active=1 where last npi_activation_days_threshold+1 days did + # all exceed the threshold + elif yesterdays_incid_over_threshold[max(0,i-npi_activation_days_threshold):i].values.sum() == npi_activation_days_threshold: int_active[i] = 1 # If no condition applies, set int_active to the value of the previous day - elif i>0: # if + elif i>0: # for i=0, int_active always will be zero (see comment above) int_active[i] = int_active[i-1] return int_active @@ -403,17 +410,17 @@ def get_npi_data(fine_resolution=2, # defines delay in number of days between exceeding # incidence threshold and NPI getting active # delay = 0 means only one day is considered (=no delay) - npi_activation_delay = 3 - npi_lifting_delay = 5 + npi_activation_days_threshold = 3 + npi_lifting_days_threshold = 5 # depending on the federal state and time period, there are # huge deviations of the lifting and activation delay which was usually # between 1 and 14 days - # we use npi_lifting_delay = 5 and npi_activation_delay = 3 + # we use npi_lifting_days_threshold = 5 and npi_activation_days_threshold = 3 # as this is the most common and has at some point been used in almost every county print('Using a delay of NPI activation of ' + - str(npi_activation_delay) + ' days.') + str(npi_activation_days_threshold) + ' days.') print('Using a delay of NPI lifting of ' + - str(npi_lifting_delay) + ' days.') + str(npi_lifting_days_threshold) + ' days.') # read manual downloaded files from directory df_npis_old, df_npis_desc = 
read_files(directory, fine_resolution) @@ -823,7 +830,7 @@ def get_npi_data(fine_resolution=2, local_incid = df_infec_local['Incidence'].copy() # get days where npis are active as int (1/0) - int_active = activate_npis_based_on_incidence(local_incid, npi_lifting_delay, npi_activation_delay, level[0]) + int_active = activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, level[0]) # multiply rows of data frame by either 1 if threshold # passed (i.e., mentioned NPI is active) or zero @@ -934,7 +941,7 @@ def get_npi_data(fine_resolution=2, pass #### start validation #### - if fine_resolution == 2 and (npi_activation_delay + npi_lifting_delay == 0): + if fine_resolution == 2 and (npi_activation_days_threshold + npi_lifting_days_threshold == 0): start_date_validation = datetime(2020, 3, 1) end_date_validation = datetime(2022, 2, 15) diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index cb01c0eefd..7a9e9f7aed 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -21,6 +21,7 @@ from pyfakefs import fake_filesystem_unittest import pandas as pd +import numpy as np from memilio.epidata import getNPIData as gnd from memilio.epidata import defaultDict as dd @@ -31,15 +32,24 @@ class TestGetNPIData(fake_filesystem_unittest.TestCase): path = '/home/NPIData' + threshold = 1.5 incid = pd.Series( [1, 2, 3, 4, 1, 1, 0, 5, 0, 1, 2, 3, 5, 1, 0, 2, 4, 5, 6]) - active = pd.Series( - [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1]) + active_11 = pd.Series( + [0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1]) + active_32 = pd.Series( + [0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]) + active_35 = pd.Series( + [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) incid_start_above_threshold = pd.Series( [2, 2, 3, 4, 1, 1, 0, 5, 0, 1, 2, 3, 5, 1, 0, 2, 4, 5, 6]) - active_start_above_threshold = pd.Series( - [0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1]) + active_start_above_threshold_11 = pd.Series( + [0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1]) + active_start_above_threshold_32 = pd.Series( + [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]) + active_start_above_threshold_35 = pd.Series( + [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) codes_to_correct = [ # should be renamed: 'M04_2', 'M04_2', 'M04_2', 'M04_2', 'M04_2', @@ -98,26 +108,52 @@ def setUp(self): self.setUpPyfakefs() def test_activate_npis_based_on_threshold(self): - threshold = 1.5 - # test with delay = 1 ; int should be active two days after incid > 1.5 - # should start with 0 since first value of incid_delay_1 = 1 < 1.5 - npi_activation_delay = 1 - npi_lifting_delay = 1 + + # test with delay = 1; should be active two days after incid > 1.5 + npi_activation_days = 1 + npi_lifting_days = 1 int_active = gnd.activate_npis_based_on_incidence( - self.incid, npi_lifting_delay, npi_activation_delay, threshold) + self.incid, npi_lifting_days, npi_activation_days, self.threshold) self.assertEqual( int_active.to_list(), - self.active.to_list()) + self.active_11.to_list()) # test same data set with first value of incid_delay_1 = 2 > 1.5 int_active_start_above_threshold = gnd.activate_npis_based_on_incidence( - self.incid_start_above_threshold, npi_lifting_delay, - npi_activation_delay, threshold) + 
self.incid_start_above_threshold, npi_lifting_days, + npi_activation_days, self.threshold) self.assertEqual( int_active_start_above_threshold.to_list(), - self.active_start_above_threshold.to_list()) + self.active_start_above_threshold_11.to_list()) - # TODO maybe test bigger data sets and npi_activation_delay != npi_lifting_delay but both >= 3 + # # tests with day values larger 1 + npi_activation_days = 3 + npi_lifting_days = 2 + int_active = gnd.activate_npis_based_on_incidence( + self.incid, npi_lifting_days, npi_activation_days, self.threshold) + self.assertEqual( + int_active.to_list(), + self.active_32.to_list()) + int_active_start_above_threshold = gnd.activate_npis_based_on_incidence( + self.incid_start_above_threshold, npi_lifting_days, + npi_activation_days, self.threshold) + self.assertEqual( + int_active_start_above_threshold.to_list(), + self.active_start_above_threshold_32.to_list()) + + npi_activation_days = 3 + npi_lifting_days = 5 + int_active = gnd.activate_npis_based_on_incidence( + self.incid, npi_lifting_days, npi_activation_days, self.threshold) + self.assertEqual( + int_active.to_list(), + self.active_35.to_list()) + int_active_start_above_threshold = gnd.activate_npis_based_on_incidence( + self.incid_start_above_threshold, npi_lifting_days, + npi_activation_days, self.threshold) + self.assertEqual( + int_active_start_above_threshold.to_list(), + self.active_start_above_threshold_35.to_list()) def test_drop_codes_and_categories(self): # test with no dropped codes or categories @@ -178,7 +214,7 @@ def test_drop_codes_and_categories(self): codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( npi_codes_prior_test_mc.copy(), npi_codes_prior_desc_test_mc.copy(), df_npis_old_test.copy(), fine_resolution) - # fine_resolution = 1 should handle missing codes and + # fine_resolution = 1 should handle missing codes fine_resolution=1 codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( npi_codes_prior_test_mc.copy(), npi_codes_prior_desc_test_mc.copy(), df_npis_old_test.copy(), fine_resolution) @@ -192,7 +228,24 @@ def test_drop_codes_and_categories(self): # dataframe should not have changed pd.testing.assert_frame_equal(df_npis_old, df_npis_old_test) - # TODO test Platzhalter + + # create dataframe with Platzhalter categories + npi_codes_prior_test = pd.Series(self.corrected_codes[0:12]) + df_npis_old_test = pd.DataFrame( + columns=[dd.EngEng['npiCode']], + data=self.corrected_codes[0:12]) + # for this test no categories should be removed + # no 'Platzhalter' in the description + npi_codes_prior_desc_test = pd.Series( + ['-' for i in range(len(self.corrected_codes[0:12]))]) + npi_codes_prior_desc_test[0:12:2] = 'Platzhalter' + # work with copies to not change the original data + codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( + npi_codes_prior_test.copy(), npi_codes_prior_desc_test.copy(), df_npis_old_test.copy(), fine_resolution) + # no codes should be dropped + self.assertEqual(codes_dropped, np.sort(self.corrected_codes[0:12:2]).tolist()) + # codes should now be corrected + self.assertEqual(df_npis_old[dd.EngEng['npiCode']].tolist(), self.corrected_codes[1:12:2]) if __name__ == '__main__': From ceec58cfc31e3413f1688c816a9c1bdd79952092 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Sun, 15 Jan 2023 21:27:37 +0100 Subject: [PATCH 044/104] write csv --- .../epidata/getDataIntoPandasDataFrame.py | 9 ++++++--- .../memilio/epidata/getNPIData.py | 6 +++++- 
...test_epidata_getDataIntoPandasDataFrame.py | 19 +++++++++++++------ 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py index ba0d1d6b4a..74c6494ff3 100644 --- a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py +++ b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py @@ -292,8 +292,8 @@ def write_dataframe(df, directory, file_prefix, file_type): - json - json_timeasstring [Default] - hdf5 + - csv The file_type defines the file format and thus also the file ending. - The file format can be json or hdf5. For this option the column Date is converted from datetime to string. @param df pandas dataframe (pandas DataFrame) @@ -305,7 +305,8 @@ def write_dataframe(df, directory, file_prefix, file_type): outForm = {'json': [".json", {"orient": "records"}], 'json_timeasstring': [".json", {"orient": "records"}], - 'hdf5': [".h5", {"key": "data"}]} + 'hdf5': [".h5", {"key": "data"}], + 'csv' : [".csv", {}]} try: outFormEnd = outForm[file_type][0] @@ -313,7 +314,7 @@ def write_dataframe(df, directory, file_prefix, file_type): except KeyError: raise ValueError( "Error: The file format: " + file_type + - " does not exist. Use json, json_timeasstring or hdf5.") + " does not exist. Use json, json_timeasstring, hdf5 or csv.") out_path = os.path.join(directory, file_prefix + outFormEnd) @@ -326,6 +327,8 @@ def write_dataframe(df, directory, file_prefix, file_type): df.to_json(out_path, **outFormSpec) elif file_type == "hdf5": df.to_hdf(out_path, **outFormSpec) + elif file_type == 'csv': + df.to_csv(out_path) print("Information: Data has been written to", out_path) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 82b2b6657f..4c99ff58d4 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -986,12 +986,16 @@ def get_npi_data(fine_resolution=2, filename = 'germany_counties_npi_maincat' gd.write_dataframe(df_npis, directory, filename, file_format) + return df_npis + def main(): """! 
Main program entry."""
 
     # arg_dict = gd.cli("testing")
-    get_npi_data(fine_resolution=2)
+    df = get_npi_data(fine_resolution=2, file_format='csv')
+
+
 
 
 if __name__ == "__main__":

diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py
index 349d58c2ad..ee4e0202b4 100644
--- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py
+++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py
@@ -473,14 +473,21 @@ def test_write_dataframe(self):
         d = {'Date': [d1, d2], 'col2': ["d1", "d2"]}
         df = pd.DataFrame(data=d)
 
-        gd.write_dataframe(df, self.path, "test_json", 'json')
+        gd.write_dataframe(df, self.path, "test_csv", 'csv')
 
-        file = "test_json.json"
+        file0 = "test_csv.json"
 
         self.assertEqual(len(os.listdir(self.path)), 1)
-        self.assertEqual(os.listdir(self.path), [file])
+        self.assertEqual(os.listdir(self.path), [file0])
+
+        gd.write_dataframe(df, self.path, "test_json", 'json')
+
+        file1 = "test_json.json"
 
-        file_with_path = os.path.join(self.path, file)
+        self.assertEqual(len(os.listdir(self.path)), 2)
+        self.assertEqual(os.listdir(self.path), [file0, file1])
+
+        file_with_path = os.path.join(self.path, file1)
         f = open(file_with_path, "r")
         fread = f.read()
         self.assertEqual(fread, self.test_string_json)
@@ -497,8 +504,8 @@ def test_write_dataframe(self):
 
         file2 = "test_json_timeasstring.json"
 
-        self.assertEqual(len(os.listdir(self.path)), 2)
-        self.assertEqual(os.listdir(self.path).sort(), [file, file2].sort())
+        self.assertEqual(len(os.listdir(self.path)), 3)
+        self.assertEqual(os.listdir(self.path).sort(), [file0, file1, file2].sort())
 
         file_with_path = os.path.join(self.path, file2)
         f = open(file_with_path, "r")

From 274560107572bbf3c9869724bccddfcd5c8793e6 Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Mon, 23 Jan 2023 14:39:56 +0100
Subject: [PATCH 045/104] Add test for full functionality with small test sets

---
 .../memilio/epidata/getNPIData.py             |  83 +++++++------
 .../test_data/TestSetNPIsCaseData.json        |   1 +
 .../test_data/TestSetNPIsCombinations.json    |   1 +
 .../test_data/TestSetNPIsDescription.json     |   1 +
 .../test_data/TestSetNPIsPopulationData.json  |   1 +
 .../test_data/TestSetNPIsUnterkategorien.json |   1 +
 .../epidata_test/test_epidata_getNPIData.py   | 112 ++++++++++++++----
 7 files changed, 144 insertions(+), 56 deletions(-)
 create mode 100644 pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCaseData.json
 create mode 100644 pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json
 create mode 100644 pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsDescription.json
 create mode 100644 pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsPopulationData.json
 create mode 100644 pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index 4c99ff58d4..01b0bfa103 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -18,19 +18,15 @@
 # limitations under the License.
############################################################################# from datetime import datetime, timedelta -import sys import time import os import pandas as pd import numpy as np -import matplotlib.pyplot as plt -from scipy.spatial.distance import pdist -from scipy.cluster import hierarchy from memilio.epidata import getDataIntoPandasDataFrame as gd from memilio.epidata import geoModificationGermany as geoger from memilio.epidata import defaultDict as dd -from memilio.epidata import customPlot +from memilio.epidata import getPopulationData as gpd def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, @@ -163,7 +159,7 @@ def read_files(directory, fine_resolution): directory, 'kr_massnahmen_oberkategorien.csv')) except FileNotFoundError: print_manual_download( - 'datensatzbeschreibung_massnahmen.xlsx', + 'kr_massnahmen_oberkategorien.csv', 'https://www.corona-datenplattform.de/dataset/massnahmen_oberkategorien_kreise') raise FileNotFoundError df_npis_old.rename(dd.GerEng, axis=1, inplace=True) @@ -185,8 +181,18 @@ def read_files(directory, fine_resolution): 'datensatzbeschreibung_massnahmen.xlsx', 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') raise FileNotFoundError + + # download combinations of npis + try: + if fine_resolution > 0: + df_npis_combinations_pre = pd.read_excel( + os.path.join( + directory, 'combination_npis.xlsx'), engine = 'openpyxl') + except FileNotFoundError: + print('File not found.') + raise FileNotFoundError - return df_npis_old, df_npis_desc + return df_npis_old, df_npis_desc, df_npis_combinations_pre def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, incid_threshold): @@ -269,6 +275,7 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, np # If no condition applies, set int_active to the value of the previous day elif i>0: # for i=0, int_active always will be zero (see comment above) int_active[i] = int_active[i-1] + # elif i==0 int active is 0 return int_active @@ -357,7 +364,9 @@ def get_npi_data(fine_resolution=2, out_folder=dd.defaultDict['out_folder'], start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], - counties_considered=geoger.get_county_ids() + counties_considered=geoger.get_county_ids(), + npi_activation_days_threshold = 3, + npi_lifting_days_threshold = 5 ): """! Loads a certain resolution of recorded NPI data from the Corona Datenplattform and extracts the counties asked for and @@ -397,8 +406,18 @@ def get_npi_data(fine_resolution=2, stored data frames. @param counties_considered [Default: 'All']. Either 'All' or a list of county IDs from 1001 to 16xxx. + @param npi_activation_days_threshold [Default: 5]. Defines days of + exceeding inidence threshold to activate NPIs. + @param npi_alifting_days_threshold [Default: 5]. Defines days of + falling below inidence threshold to lift NPIs. 
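+
+    A sketch of a typical call (illustrative values only; it assumes the
+    manually downloaded files described above are available):
+
+        df_npis = get_npi_data(fine_resolution=2,
+                               counties_considered=[1001],
+                               npi_activation_days_threshold=3,
+                               npi_lifting_days_threshold=5)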
""" + # depending on the federal state and time period, there are + # huge deviations of the lifting and activation delay which was usually + # between 1 and 14 days + # we use npi_lifting_days_threshold = 5 and npi_activation_days_threshold = 3 as default + # as this is the most common and has at some point been used in almost every county + if counties_considered == 'All': counties_considered = geoger.get_county_ids() @@ -406,24 +425,8 @@ def get_npi_data(fine_resolution=2, directory = os.path.join(directory, 'Germany/') gd.check_dir(directory) - if fine_resolution > 0: - # defines delay in number of days between exceeding - # incidence threshold and NPI getting active - # delay = 0 means only one day is considered (=no delay) - npi_activation_days_threshold = 3 - npi_lifting_days_threshold = 5 - # depending on the federal state and time period, there are - # huge deviations of the lifting and activation delay which was usually - # between 1 and 14 days - # we use npi_lifting_days_threshold = 5 and npi_activation_days_threshold = 3 - # as this is the most common and has at some point been used in almost every county - print('Using a delay of NPI activation of ' + - str(npi_activation_days_threshold) + ' days.') - print('Using a delay of NPI lifting of ' + - str(npi_lifting_days_threshold) + ' days.') - # read manual downloaded files from directory - df_npis_old, df_npis_desc = read_files(directory, fine_resolution) + df_npis_old, df_npis_desc, df_npis_combinations_pre = read_files(directory, fine_resolution) # get existing codes that are used (in df_npis_old M22-M24 are empty) npi_codes_prior = df_npis_desc['Variablenname'] @@ -436,10 +439,6 @@ def get_npi_data(fine_resolution=2, # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each # other if fine_resolution > 0: - df_npis_combinations_pre = pd.read_excel( - os.path.join( - directory, 'combination_npis.xlsx'), engine = 'openpyxl') - # rename essential columns and throw away others column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] rename_columns = {column_names[i]: i for i in range(len(column_names))} @@ -530,7 +529,7 @@ def get_npi_data(fine_resolution=2, writer, sheet_name=npi_groups_combinations_unique[i]) del df_out if write_file: - writer.save() + writer.close() # correct differences in codes between data sheet and explanation sheet codes_dropped, npi_codes_prior, df_npis_old = drop_codes_and_categories( @@ -666,8 +665,12 @@ def get_npi_data(fine_resolution=2, counties_removed = df_npis_old[ ~df_npis_old[dd.EngEng['idCounty']].isin(counties_considered)][ dd.EngEng['idCounty']].unique() - if list(counties_removed) != [16056]: - raise gd.DataError('Error. Other counties than that of Eisenach were removed.') + if 16056 in counties_considered: + if list(counties_removed) != [16056]: + raise gd.DataError('Error. Other counties than that of Eisenach were removed.') + else: + if counties_removed.size > 0: + raise gd.DataError('Error. 
Other counties than that of Eisenach were removed.') # remove rows for Eisenach df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( counties_considered)].reset_index(drop=True) @@ -705,8 +708,11 @@ def get_npi_data(fine_resolution=2, directory, 'cases_all_county_all_dates_repdate.json')) df_infec_rki[dd.EngEng['date']] = pd.to_datetime( df_infec_rki[dd.EngEng['date']]) - df_population = pd.read_json( - directory + "county_current_population.json") + try: + df_population = pd.read_json( + directory + "county_current_population.json") + except: + df_population=gpd.get_population_data() min_date.append( df_infec_rki[dd.EngEng['date']].min().to_pydatetime()) max_date.append( @@ -739,6 +745,7 @@ def get_npi_data(fine_resolution=2, # replace -99 ("not used anymore") by 0 ("not used") # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned") df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True) + counterdb = 0 for countyID in counties_considered: cid = 0 @@ -762,6 +769,11 @@ def get_npi_data(fine_resolution=2, df_infec_local = df_infec_local[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( df_infec_local[dd.EngEng['date']] <= end_date_new)].reset_index() + + local_incid = df_infec_local['Incidence'].copy() + if local_incid[0]>0: + counterdb += 1 + # get county-local data frame start_time = time.perf_counter() df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] @@ -827,7 +839,6 @@ def get_npi_data(fine_resolution=2, # is exceeded for level, npi_indices in incidence_thresholds_to_npis.items(): if level[0] >= 0: # level[0] = incidvalthrsh - local_incid = df_infec_local['Incidence'].copy() # get days where npis are active as int (1/0) int_active = activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, level[0]) @@ -924,6 +935,8 @@ def get_npi_data(fine_resolution=2, str(len(counties_considered)) + '. Estimated time remaining: ' + str(int(time_remain / 60)) + ' min.') + if counterdb >= len(counties_considered)*0.05: + print('WARNING: DataFrame starts with incidence > 0, thus incidence dependent NPIs could not be activated correctly. 
Please consider a start date 1 or 2 weeks ahead of your analysis.') # print sub counters print('Sub task counters are: ') diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCaseData.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCaseData.json new file mode 100644 index 0000000000..ef762b4a6b --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCaseData.json @@ -0,0 +1 @@ +[{"Date":1672963200000,"ID_County":1001,"Confirmed":11074,"Deaths":9,"Recovered":11014},{"Date":1673049600000,"ID_County":1001,"Confirmed":11082,"Deaths":9,"Recovered":11021},{"Date":1673136000000,"ID_County":1001,"Confirmed":11085,"Deaths":9,"Recovered":11029},{"Date":1673222400000,"ID_County":1001,"Confirmed":11085,"Deaths":9,"Recovered":11032},{"Date":1673308800000,"ID_County":1001,"Confirmed":11089,"Deaths":9,"Recovered":11033},{"Date":1673395200000,"ID_County":1001,"Confirmed":11094,"Deaths":9,"Recovered":11041},{"Date":1673481600000,"ID_County":1001,"Confirmed":11096,"Deaths":9,"Recovered":11044},{"Date":1673568000000,"ID_County":1001,"Confirmed":11102,"Deaths":9,"Recovered":11045},{"Date":1673654400000,"ID_County":1001,"Confirmed":11103,"Deaths":9,"Recovered":11053},{"Date":1673740800000,"ID_County":1001,"Confirmed":11108,"Deaths":9,"Recovered":11053},{"Date":1673827200000,"ID_County":1001,"Confirmed":11113,"Deaths":9,"Recovered":11059},{"Date":1673913600000,"ID_County":1001,"Confirmed":11114,"Deaths":9,"Recovered":11061},{"Date":1674000000000,"ID_County":1001,"Confirmed":11115,"Deaths":9,"Recovered":11066},{"Date":1674086400000,"ID_County":1001,"Confirmed":11117,"Deaths":9,"Recovered":11068},{"Date":1674172800000,"ID_County":1001,"Confirmed":11125,"Deaths":9,"Recovered":11070},{"Date":1674259200000,"ID_County":1001,"Confirmed":11145,"Deaths":9,"Recovered":11073},{"Date":1674345600000,"ID_County":1001,"Confirmed":11149,"Deaths":9,"Recovered":11088},{"Date":1674432000000,"ID_County":1001,"Confirmed":11159,"Deaths":9,"Recovered":11101},{"Date":1674518400000,"ID_County":1001,"Confirmed":11175,"Deaths":9,"Recovered":11109},{"Date":1674604800000,"ID_County":1001,"Confirmed":11183,"Deaths":10,"Recovered":11129},{"Date":1674691200000,"ID_County":1001,"Confirmed":11193,"Deaths":10,"Recovered":11132},{"Date":1674777600000,"ID_County":1001,"Confirmed":11196,"Deaths":10,"Recovered":11148},{"Date":1674864000000,"ID_County":1001,"Confirmed":11209,"Deaths":10,"Recovered":11151},{"Date":1674950400000,"ID_County":1001,"Confirmed":11226,"Deaths":10,"Recovered":11153},{"Date":1675036800000,"ID_County":1001,"Confirmed":11226,"Deaths":10,"Recovered":11169},{"Date":1675123200000,"ID_County":1001,"Confirmed":11229,"Deaths":10,"Recovered":11189}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json new file mode 100644 index 0000000000..a76f59d32c --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json @@ -0,0 +1 @@ +[{"Unnamed: 0":0,"Variablenname":"M1_1","Variable":"Beschr\u00e4nkung X","Unnamed: 3":"x","Unnamed: 4":null,"Unnamed: 5":null,"Unnamed: 6":null,"Unnamed: 7":null,"Unnamed: 8":null,"Unnamed: 9":null,"Unnamed: 10":null,"Unnamed: 11":null,"Unnamed: 12":null,"Unnamed: 13":null,"Unnamed: 14":null,"Unnamed: 15":null,"Unnamed: 16":null,"Unnamed: 17":null,"Unnamed: 18":null},{"Unnamed: 0":6,"Variablenname":"M1_2","Variable":"Beschr\u00e4nkung 
Y","Unnamed: 3":null,"Unnamed: 4":"x","Unnamed: 5":"x","Unnamed: 6":null,"Unnamed: 7":null,"Unnamed: 8":null,"Unnamed: 9":null,"Unnamed: 10":null,"Unnamed: 11":null,"Unnamed: 12":null,"Unnamed: 13":null,"Unnamed: 14":null,"Unnamed: 15":null,"Unnamed: 16":null,"Unnamed: 17":null,"Unnamed: 18":null},{"Unnamed: 0":12,"Variablenname":"M1_3","Variable":"Beschr\u00e4nkung Z","Unnamed: 3":null,"Unnamed: 4":"x","Unnamed: 5":"x","Unnamed: 6":null,"Unnamed: 7":null,"Unnamed: 8":null,"Unnamed: 9":null,"Unnamed: 10":null,"Unnamed: 11":null,"Unnamed: 12":null,"Unnamed: 13":null,"Unnamed: 14":null,"Unnamed: 15":null,"Unnamed: 16":null,"Unnamed: 17":null,"Unnamed: 18":null}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsDescription.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsDescription.json new file mode 100644 index 0000000000..92a4c4f895 --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsDescription.json @@ -0,0 +1 @@ +[{"Variablenname":"M1_1","Variable":"Beschr\u00e4nkung X","Beschreibung":"Unabh\u00e4ngig von der Inzidenz; Beschr\u00e4nkung X","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_1_1","Variable":"Beschr\u00e4nkung X","Beschreibung":"Ab 0 Neuinfekt. pro 100K; Beschr\u00e4nkung X","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_1_2","Variable":"Beschr\u00e4nkung X","Beschreibung":"Ab 10 Neuinfekt. pro 100K; Beschr\u00e4nkung X","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_1_3","Variable":"Beschr\u00e4nkung X","Beschreibung":"Ab 35 Neuinfekt. pro 100K; Beschr\u00e4nkung X","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_1_4","Variable":"Beschr\u00e4nkung X","Beschreibung":"Ab 50 Neuinfekt. pro 100K; Beschr\u00e4nkung X","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_1_5","Variable":"Beschr\u00e4nkung X","Beschreibung":"Ab 100 Neuinfekt. pro 100K; Beschr\u00e4nkung X","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_2","Variable":"Beschr\u00e4nkung Y","Beschreibung":"Unabh\u00e4ngig von der Inzidenz; Beschr\u00e4nkung Y","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_2_1","Variable":"Beschr\u00e4nkung Y","Beschreibung":"Ab 0 Neuinfekt. pro 100K; Beschr\u00e4nkung Y","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_2_2","Variable":"Beschr\u00e4nkung Y","Beschreibung":"Ab 10 Neuinfekt. 
pro 100K; Beschr\u00e4nkung Y","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_2_3","Variable":"Beschr\u00e4nkung Y","Beschreibung":"Ab 35 Neuinfekt. pro 100K; Beschr\u00e4nkung Y","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_2_4","Variable":"Beschr\u00e4nkung Y","Beschreibung":"Ab 50 Neuinfekt. pro 100K; Beschr\u00e4nkung Y","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_2_5","Variable":"Beschr\u00e4nkung Y","Beschreibung":"Ab 100 Neuinfekt. pro 100K; Beschr\u00e4nkung Y","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_3","Variable":"Beschr\u00e4nkung Y","Beschreibung":"Unabh\u00e4ngig von der Inzidenz; Beschr\u00e4nkung Z","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_3_1","Variable":"Beschr\u00e4nkung Z","Beschreibung":"Ab 0 Neuinfekt. pro 100K; Beschr\u00e4nkung Z","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_3_2","Variable":"Beschr\u00e4nkung Z","Beschreibung":"Ab 10 Neuinfekt. pro 100K; Beschr\u00e4nkung Z","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_3_3","Variable":"Beschr\u00e4nkung Z","Beschreibung":"Ab 35 Neuinfekt. pro 100K; Beschr\u00e4nkung Z","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_3_4","Variable":"Beschr\u00e4nkung Z","Beschreibung":"Ab 50 Neuinfekt. pro 100K; Beschr\u00e4nkung Z","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"},{"Variablenname":"M1_3_5","Variable":"Beschr\u00e4nkung Z","Beschreibung":"Ab 100 Neuinfekt. 
pro 100K; Beschr\u00e4nkung Z","Auspr\u00e4gungen":"-","Quelle":"test set","Ebene":"Kreis","Periodizit\u00e4t":"tagesaktuell","Datenstand":"ab 19.01.2023","Datentyp":"numerisch"}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsPopulationData.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsPopulationData.json new file mode 100644 index 0000000000..6690b0591d --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsPopulationData.json @@ -0,0 +1 @@ +[{"ID_County":1001,"Population":98663,"<3 years":9711,"3-5 years":8499,"6-14 years":5103,"15-17 years":10673,"18-24 years":9915,"25-29 years":11048,"30-39 years":2346,"40-49 years":12995,"50-64 years":11268,"65-74 years":3744,">74 years":13361}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json new file mode 100644 index 0000000000..0e1127237a --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json @@ -0,0 +1 @@ +[{"_id":1,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":1,"d20230127":1,"d20230128":1,"d20230129":2,"d20230130":2,"d20230131":1},{"_id":2,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_1","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":3,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_2","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":4,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_3","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":5,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_4","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":6,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_5","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":7,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2","d20230119":1,"d20230120":1,"d20230121":2,"d20230122":2,"d20230123":2,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":8,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, 
Stadt","code":"M1_2_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":9,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_2","d20230119":2,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":10,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":11,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_4","d20230119":3,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":12,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_5","d20230119":4,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":13,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":14,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":15,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_2","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":4,"d20230128":4,"d20230129":4,"d20230130":4,"d20230131":4},{"_id":16,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":17,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_4","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":3,"d20230130":3,"d20230131":3},{"_id":18,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_5","d20230119":0,"d20230120":1,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index 7a9e9f7aed..6863f01c50 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -19,19 +19,34 @@ 
###################################################################### import unittest from pyfakefs import fake_filesystem_unittest +from unittest.mock import patch +import os import pandas as pd -import numpy as np +import numpy as np + +from datetime import date from memilio.epidata import getNPIData as gnd from memilio.epidata import defaultDict as dd from memilio.epidata import getDataIntoPandasDataFrame as gd + class TestGetNPIData(fake_filesystem_unittest.TestCase): maxDiff = None path = '/home/NPIData' + #load test data from test_data folder + here = os.path.dirname(os.path.abspath(__file__)) + df_npis_old = pd.read_json(os.path.join(here, 'test_data', 'TestSetNPIsUnterkategorien.json')) + df_npis_desc = pd.read_json(os.path.join(here, 'test_data', 'TestSetNPIsDescription.json')) + df_npis_combinations_pre = pd.read_json(os.path.join(here, 'test_data', 'TestSetNPIsCombinations.json')) + df_cases = pd.read_json(os.path.join(here, 'test_data', 'TestSetNPIsCaseData.json')) + df_pop = pd.read_json(os.path.join(here, 'test_data', 'TestSetNPIsPopulationData.json')) + + df_npis_old_renamed = df_npis_old.rename(dd.GerEng, axis=1, inplace=False) + threshold = 1.5 incid = pd.Series( [1, 2, 3, 4, 1, 1, 0, 5, 0, 1, 2, 3, 5, 1, 0, 2, 4, 5, 6]) @@ -94,21 +109,21 @@ class TestGetNPIData(fake_filesystem_unittest.TestCase): 'M01_100_4', 'M01_110_4', 'M01_120_4', 'M01_130_4', 'M01_140_4', 'M01_100_5', 'M01_110_5', 'M01_120_5', 'M01_130_5', 'M01_140_5', ] - codes_to_drop = ['M02_120', 'M02_120_1', 'M02_120_2', + codes_to_drop = ['M02_120', 'M02_120_1', 'M02_120_2', 'M02_120_3', 'M02_120_4', 'M02_120_5', - 'M07_100', 'M07_100_1', 'M07_100_2', + 'M07_100', 'M07_100_1', 'M07_100_2', 'M07_100_3', 'M07_100_4', 'M07_100_5'] missing_codes = ['M02_120_1', 'M02_120_2', 'M02_120_3', 'M02_120_4', 'M02_120_5', - 'M07_100_1', 'M07_100_2', + 'M07_100_1', 'M07_100_2', 'M07_100_3', 'M07_100_4', 'M07_100_5'] def setUp(self): self.setUpPyfakefs() def test_activate_npis_based_on_threshold(self): - + # test with delay = 1; should be active two days after incid > 1.5 npi_activation_days = 1 npi_lifting_days = 1 @@ -125,8 +140,8 @@ def test_activate_npis_based_on_threshold(self): self.assertEqual( int_active_start_above_threshold.to_list(), self.active_start_above_threshold_11.to_list()) - - # # tests with day values larger 1 + + # tests with day values larger 1 npi_activation_days = 3 npi_lifting_days = 2 int_active = gnd.activate_npis_based_on_incidence( @@ -198,7 +213,8 @@ def test_drop_codes_and_categories(self): # test handling of missing codes # more codes in npi_codes_prior than in df - npi_codes_prior_test_mc = pd.Series(self.codes_to_correct + self.missing_codes) + npi_codes_prior_test_mc = pd.Series( + self.codes_to_correct + self.missing_codes) # create dataframe with corrected columns # just one column, no data df_npis_old_test = pd.DataFrame( @@ -207,15 +223,17 @@ def test_drop_codes_and_categories(self): # for this test no categories should be removed # no 'Platzhalter' in the description npi_codes_prior_desc_test_mc = pd.Series( - ['-' for i in range(len(self.codes_to_correct+ self.missing_codes))]) - + ['-' + for i in range( + len(self.codes_to_correct + self.missing_codes))]) + # with fine_resolution = 2 every missing code should raise a DataError with self.assertRaises(gd.DataError): codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( - npi_codes_prior_test_mc.copy(), npi_codes_prior_desc_test_mc.copy(), df_npis_old_test.copy(), fine_resolution) + npi_codes_prior_test_mc.copy(), 
npi_codes_prior_desc_test_mc.copy(), df_npis_old_test.copy(), fine_resolution) - # fine_resolution = 1 should handle missing codes - fine_resolution=1 + # fine_resolution = 1 should handle missing codes + fine_resolution = 1 codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( npi_codes_prior_test_mc.copy(), npi_codes_prior_desc_test_mc.copy(), df_npis_old_test.copy(), fine_resolution) # no codes should be dropped @@ -228,24 +246,76 @@ def test_drop_codes_and_categories(self): # dataframe should not have changed pd.testing.assert_frame_equal(df_npis_old, df_npis_old_test) - # create dataframe with Platzhalter categories npi_codes_prior_test = pd.Series(self.corrected_codes[0:12]) df_npis_old_test = pd.DataFrame( columns=[dd.EngEng['npiCode']], data=self.corrected_codes[0:12]) - # for this test no categories should be removed - # no 'Platzhalter' in the description + # add 'Platzhalter' every second element npi_codes_prior_desc_test = pd.Series( - ['-' for i in range(len(self.corrected_codes[0:12]))]) + ['-' for i in range(0, 12)]) npi_codes_prior_desc_test[0:12:2] = 'Platzhalter' # work with copies to not change the original data codes_dropped, npi_codes_prior, df_npis_old = gnd.drop_codes_and_categories( npi_codes_prior_test.copy(), npi_codes_prior_desc_test.copy(), df_npis_old_test.copy(), fine_resolution) - # no codes should be dropped - self.assertEqual(codes_dropped, np.sort(self.corrected_codes[0:12:2]).tolist()) - # codes should now be corrected - self.assertEqual(df_npis_old[dd.EngEng['npiCode']].tolist(), self.corrected_codes[1:12:2]) + # check dropped codes + self.assertEqual(codes_dropped, np.sort( + self.corrected_codes[0:12:2]).tolist()) + # every second category dropped but naming should be the same + self.assertEqual( + df_npis_old[dd.EngEng['npiCode']].tolist(), + self.corrected_codes[1: 12: 2]) + + # test full function only for 13 days and one county + # use side effect to return different values to pd.read_json (first cases, then population) + + @patch('pandas.read_json', side_effect=[df_cases, df_pop]) + @patch('memilio.epidata.getNPIData.read_files', + return_value=[df_npis_old, df_npis_desc, df_npis_combinations_pre]) + @patch('memilio.epidata.getNPIData.drop_codes_and_categories', + return_value=[[], + df_npis_desc['Variablenname'], + df_npis_old_renamed]) + def test_get_npi_data(self, mock_codes, mock_read, mock_data): + # print 'Additional errors in consistent naming' is expected + # print 'WARNING: DataFrame starts with incidence > 0, thus incidence dependent NPIs could not be activated correctly.' is expected + npis_test = gnd.get_npi_data( + fine_resolution=2, out_folder=self.path, + counties_considered=[1001], + end_date=date(2024, 1, 1), + npi_activation_days_threshold=3, npi_lifting_days_threshold=2) + # test if mocks have been called + self.assertEqual(mock_data.call_count, 2) + self.assertEqual(mock_read.call_count, 1) + self.assertEqual(mock_codes.call_count, 1) + # some columns should be empty + # either because they're not mentioned or because the incidence is not exceeded. 
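+        # (Assumed mapping for this test set, with the first two columns
+        # taken to be ID_County and Date: columns 3-7 are M1_1_1 to M1_1_5,
+        # which are marked unused (-99) in the data; columns 9, 11, 14, 15
+        # and 17 are M1_2_1, M1_2_3, M1_3, M1_3_1 and M1_3_3, which are
+        # never mentioned for this county.)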
+ self.assertEqual( + npis_test.iloc[:, [3, 4, 5, 6, 7, 9, 11, 14, 15, 17]].values.sum(), 0) + # incidence independent NPIs should not have changed + self.assertEqual( + npis_test.M1_1.to_list(), + [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) + self.assertEqual( + npis_test.M1_2.to_list(), + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) + self.assertEqual( + npis_test.M1_3.to_list(), + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + # test remaining_columns + # incidence is constantly > 20 + #M1_1_1,M1_2_1,M1_3_1,M1_1_2,M1_3_2 always 0 + self.assertEqual( + npis_test.M1_2_2.tolist(), + [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) + #_4 -> Incidence > 50 + self.assertEqual( + npis_test.M1_3_4.to_list(), + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]) + self.assertEqual( + npis_test.M1_2_4.to_list(), + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]) + #_5 is always 0 since incidence is never > 100 in this test if __name__ == '__main__': From 039dc113026c9bf218566379cac26423710b420f Mon Sep 17 00:00:00 2001 From: Anna Wendler Date: Mon, 30 Jan 2023 12:42:00 +0100 Subject: [PATCH 046/104] Fix merge conflicts --- .../memilio/epidata/getNPIData.py | 101 ++++++++++++++++-- 1 file changed, 94 insertions(+), 7 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 01b0bfa103..172b072128 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -131,15 +131,62 @@ def read_files(directory, fine_resolution): """ if fine_resolution > 0: try: - df_npis_old = pd.read_csv( - os.path.join(directory, 'kr_massnahmen_unterkategorien.csv'), - sep=',') # , nrows=1248) # 1248 for debugging, only reading Flensburg + + codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', + 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] + counter_codes = 0 + for code in codelist: + + df_npis_per_code = pd.read_csv( + os.path.join(directory, + 'kr_massn_unterkat_{}.csv'.format(code)), + sep=',') + + # set some parameters for dataframe + if counter_codes == 0: + counties = np.sort(df_npis_per_code.ags5.unique()) + num_counties = len(df_npis_per_code.ags5.unique()) + + # extract dates from data + dates = df_npis_per_code.iloc[:int(df_npis_per_code.shape[0]/num_counties), 5] + # rename dates so that they match dates from other npi dataframe + dates_new = ['d' + date.replace('-', '') for date in dates] + + df_local = [pd.DataFrame() for i in range(num_counties)] + print('.') + + # set df for all counties + for i in range(0,num_counties): + print(i) + if counter_codes == 0: + df_local[i] = pd.DataFrame(columns=list(df_npis_per_code.columns[0:5]) + ['code'] + dates_new) + + dummy_to_append = pd.DataFrame(columns=['code'] + dates_new, data=df_npis_per_code[df_npis_per_code.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) + + df_local[i] = pd.concat([df_local[i], dummy_to_append]) + + if df_npis_per_code.iloc[i*len(dates):(i+1)*len(dates),3].nunique() > 1: + raise gd.DataError('Dates are not sorted as expected.') + + # Set first five columns so that they match old format of data frame (from kr_massnahmen_unterkategorien.csv) + if counter_codes == len(codelist)-1: + df_local[i].iloc[:,0:5] = df_npis_per_code.iloc[i*len(dates),0:5].values + + print(df_local[i]) + + counter_codes += 1 + + except FileNotFoundError: print_manual_download( 'kr_massnahmen_unterkategorien.csv', 
'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') raise FileNotFoundError + + df_npis_old = pd.concat([df_local[i] for i in range(num_counties)]) df_npis_old.rename(dd.GerEng, axis=1, inplace=True) + df_npis_old['NPI_code'] = df_npis_old['NPI_code'].str.replace('code_m', 'M') + print(df_npis_old) # check if rows hospitals and geriatric care are still empty; # these fields have been empty so far and are thus not used @@ -425,12 +472,40 @@ def get_npi_data(fine_resolution=2, directory = os.path.join(directory, 'Germany/') gd.check_dir(directory) +<<<<<<< Updated upstream +======= + if fine_resolution > 0: + # defines delay in number of days between exceeding + # incidence threshold and NPI getting active + # delay = 0 means only one day is considered (=no delay) + npi_activation_days_threshold = 3 + npi_lifting_days_threshold = 5 + # depending on the federal state and time period, there are + # huge deviations of the lifting and activation delay which was usually + # between 1 and 14 days + # we use npi_lifting_days_threshold = 5 and npi_activation_days_threshold = 3 + # as this is the most common and has at some point been used in almost every county + print('Using a threshold for NPI activation of ' + + str(npi_activation_days_threshold) + ' days.') + print('Using a threshold for NPI lifting of ' + + str(npi_lifting_days_threshold) + ' days.') + +>>>>>>> Stashed changes # read manual downloaded files from directory df_npis_old, df_npis_desc, df_npis_combinations_pre = read_files(directory, fine_resolution) - # get existing codes that are used (in df_npis_old M22-M24 are empty) - npi_codes_prior = df_npis_desc['Variablenname'] - npi_codes_prior_desc = df_npis_desc['Variable'] + # get existing codes that are used + # for fine resolution we don't have codes M22 - M24 but are still listed in description + if fine_resolution > 0: + # count how many codes contain M22, M23 or M24 + num_nonexistent_codes = df_npis_desc['Variablenname'].str.count("M22|M23|M24").sum() + # do not include these nonexistent codes + npi_codes_prior = df_npis_desc['Variablenname'].iloc[:-num_nonexistent_codes] + npi_codes_prior_desc = df_npis_desc['Variable'].iloc[:-num_nonexistent_codes] + # for fine_resolution = 0 df_npis_old M22-M24 are empty) + else: + npi_codes_prior = df_npis_desc['Variablenname'] + npi_codes_prior_desc = df_npis_desc['Variable'] # for fine_resolution > 0 deactivation of non-combinable # incidence-dependent NPIs has to be conducted; therefore we defined a @@ -439,6 +514,16 @@ def get_npi_data(fine_resolution=2, # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each # other if fine_resolution > 0: +<<<<<<< Updated upstream +======= + df_npis_combinations_pre = pd.read_excel( + os.path.join( + directory, 'combination_npis.xlsx'), engine = 'openpyxl') + + + num_nonexistent_codes2 = df_npis_combinations_pre['Variablenname'].str.count("M22|M23|M24").sum() + df_npis_combinations_pre = df_npis_combinations_pre.iloc[:-num_nonexistent_codes2,:] +>>>>>>> Stashed changes # rename essential columns and throw away others column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] rename_columns = {column_names[i]: i for i in range(len(column_names))} @@ -1006,7 +1091,9 @@ def main(): """! 
Main program entry.""" # arg_dict = gd.cli("testing") - df = get_npi_data(fine_resolution=2, file_format='csv') + df = get_npi_data(fine_resolution=1, file_format='csv') + + #df = read_files(directory, fine_resolution) From 169d5b4c86a6c310bd2c68f9175e5825ce2335cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 30 Jan 2023 16:24:10 +0100 Subject: [PATCH 047/104] correction of tests --- .../memilio/epidata/getDataIntoPandasDataFrame.py | 2 +- .../epidata_test/test_epidata_getDataIntoPandasDataFrame.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py index 74c6494ff3..5e4dfe3558 100644 --- a/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py +++ b/pycode/memilio-epidata/memilio/epidata/getDataIntoPandasDataFrame.py @@ -314,7 +314,7 @@ def write_dataframe(df, directory, file_prefix, file_type): except KeyError: raise ValueError( "Error: The file format: " + file_type + - " does not exist. Use json, json_timeasstring, hdf5 or csv.") + " does not exist. Use json, json_timeasstring, csv, or hdf5.") out_path = os.path.join(directory, file_prefix + outFormEnd) diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py index ee4e0202b4..e1e95f2b23 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getDataIntoPandasDataFrame.py @@ -475,7 +475,7 @@ def test_write_dataframe(self): gd.write_dataframe(df, self.path, "test_csv", 'csv') - file0 = "test_csv.json" + file0 = "test_csv.csv" self.assertEqual(len(os.listdir(self.path)), 1) self.assertEqual(os.listdir(self.path), [file0]) @@ -534,7 +534,7 @@ def test_write_dataframe_error(self): gd.write_dataframe(df, self.path, "test_json", 'wrong') error_message = "Error: The file format: " + 'wrong' + \ - " does not exist. Use json, json_timeasstring or hdf5." + " does not exist. Use json, json_timeasstring, csv, or hdf5." self.assertEqual(str(error.exception), error_message) @patch('memilio.epidata.getDIVIData.get_divi_data') From b3f7eaf513698961c481adebdced21ebeade67e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 30 Jan 2023 17:16:02 +0100 Subject: [PATCH 048/104] Correct for wrongly computed incidence if cases data starts with value x, now starting with 0 and editing some comments. --- .../memilio/epidata/getNPIData.py | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 01b0bfa103..fa26dcdab8 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -406,17 +406,16 @@ def get_npi_data(fine_resolution=2, stored data frames. @param counties_considered [Default: 'All']. Either 'All' or a list of county IDs from 1001 to 16xxx. - @param npi_activation_days_threshold [Default: 5]. Defines days of - exceeding inidence threshold to activate NPIs. - @param npi_alifting_days_threshold [Default: 5]. Defines days of - falling below inidence threshold to lift NPIs. + @param npi_activation_days_threshold [Default: 3]. 
Defines necessary number
+        of days exceeding case incidence threshold to activate NPIs.
+    @param npi_lifting_days_threshold [Default: 5]. Defines necessary number
+        of days below case incidence threshold to lift NPIs.
     """
-    # depending on the federal state and time period, there are
-    # huge deviations of the lifting and activation delay which was usually
-    # between 1 and 14 days
-    # we use npi_lifting_days_threshold = 5 and npi_activation_days_threshold = 3 as default
-    # as this is the most common and has at some point been used in almost every county
+    # Depending on the federal state and time period, there are huge
+    # differences in the number of days before lifting and activation.
+    # It was usually between 1 and 14 days. We use npi_lifting_days_threshold = 5
+    # and npi_activation_days_threshold = 3 as default averaged values.
 
     if counties_considered == 'All':
         counties_considered = geoger.get_county_ids()
@@ -745,7 +744,7 @@ def get_npi_data(fine_resolution=2,
     # replace -99 ("not used anymore") by 0 ("not used")
     # replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned")
     df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True)
-    counterdb = 0
+    counter_cases_start = 0
 
     for countyID in counties_considered:
         cid = 0
@@ -757,9 +756,17 @@ def get_npi_data(fine_resolution=2,
             )
             pop_local = df_population.loc[df_population[dd.EngEng['idCounty']]
                                           == countyID, dd.EngEng['population']].values[0]
+
             # consider difference between current day and day-7 to compute incidence
-            df_infec_local['Incidence'] = df_infec_local[dd.EngEng['confirmed']].diff(
-                periods=7).fillna(df_infec_local[dd.EngEng['confirmed']]) / pop_local * 100000
+            # As a helper, repeat the first entry seven times; the incidence then always starts at 0.
+            cases_first_value = df_infec_local[dd.EngEng['confirmed']].values[0]
+            df_infec_local_repeat_first_entry = [
+                cases_first_value for i in range(7)] + list(
+                df_infec_local[dd.EngEng['confirmed']].values.transpose())
+
+            df_infec_local['Incidence'] = (pd.Series(
+                df_infec_local_repeat_first_entry).diff(periods=7) /
+                pop_local * 100000)[7:].values
 
             # set to main data frame
             df_infec_rki.loc[df_infec_rki[dd.EngEng['idCounty']] ==
@@ -771,8 +778,10 @@ def get_npi_data(fine_resolution=2,
 
             local_incid = df_infec_local['Incidence'].copy()
 
-            if local_incid[0]>0:
-                counterdb += 1
+            # Count counties with start cases >= 1:
+            # In this case, NPI activation cannot be ensured to work as expected.
+            if cases_first_value >= 1:
+                counter_cases_start += 1
 
             # get county-local data frame
             start_time = time.perf_counter()
@@ -935,8 +944,13 @@ def get_npi_data(fine_resolution=2,
                 str(len(counties_considered)) + '. Estimated time remaining: ' +
                 str(int(time_remain / 60)) + ' min.')
 
-    if counterdb >= len(counties_considered)*0.05:
-        print('WARNING: DataFrame starts with incidence > 0, thus incidence dependent NPIs could not be activated correctly. Please consider a start date 1 or 2 weeks ahead of your analysis.')
+    if counter_cases_start >= len(counties_considered)*0.05:
+        print('WARNING: DataFrame starts with reported cases > 0 '
+              'for more than 5 percent of the counties to be considered. '
+              'In this case, incidence computation and activation of '
+              'incidence-dependent NPIs cannot be ensured to work correctly. '
+              'Please consider a start date of some weeks ahead of the '
+              'time window to be analyzed for NPIs\' effects.')
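+    # Illustration with hypothetical numbers (pop_local = 100000): for
+    # reported cases [50, 50, 52, ...], the helper list is [50]*7 + [50, 50,
+    # 52, ...], so diff(periods=7)[7:] yields an incidence of [0, 0, 2, ...].
+    # The incidence thus always starts at 0, although 50 cases were already
+    # reported on day 1; NPIs activated due to these initial cases cannot be
+    # deduced from the data, which is what the warning above points to.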
 
     # print sub counters
     print('Sub task counters are: ')
 
From e507e405bc9d06b8c92bbcf4697036f525e6be02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=BChn?=
Date: Tue, 31 Jan 2023 10:00:16 +0100
Subject: [PATCH 049/104] change test comments

---
 .../memilio/epidata_test/test_epidata_getNPIData.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py
index 6863f01c50..d35f775e08 100644
--- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py
@@ -266,9 +266,9 @@ def test_drop_codes_and_categories(self):
             df_npis_old[dd.EngEng['npiCode']].tolist(),
             self.corrected_codes[1: 12: 2])
 
-    # test full function only for 13 days and one county
-    # use side effect to return different values to pd.read_json (first cases, then population)
-
+    # Test full functionality only for 13 days and one county.
+    # Use side_effect to return first case data, then population data with the
+    # 1st and 2nd call to pd.read_json().
     @patch('pandas.read_json', side_effect=[df_cases, df_pop])
     @patch('memilio.epidata.getNPIData.read_files',
            return_value=[df_npis_old, df_npis_desc, df_npis_combinations_pre])
     @patch('memilio.epidata.getNPIData.drop_codes_and_categories', return_value=[
         npi_codes_prior_test,
         df_npis_desc['Variablenname'], df_npis_old_renamed])
     def test_get_npi_data(self, mock_codes, mock_read, mock_data):
 
-        # print 'Additional errors in consistent naming' is expected
-        # print 'WARNING: DataFrame starts with incidence > 0, thus incidence dependent NPIs could not be activated correctly.' is expected
+        # print 'Additional errors in consistent naming' is expected.
+        # print 'WARNING: DataFrame starts with reported cases > 0 for more than 5 percent...' is expected.
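+        # (Illustration of the side_effect mechanism, hypothetical values:
+        # a mock built with side_effect=[df_cases, df_pop] returns df_cases
+        # on its first call and df_pop on its second call; this feeds cases
+        # data, then population data, to the two pd.read_json calls.)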
npis_test = gnd.get_npi_data( fine_resolution=2, out_folder=self.path, counties_considered=[1001], From 6508fbc6681c3ee34ac7c6c501948439de713422 Mon Sep 17 00:00:00 2001 From: Anna Wendler Date: Tue, 31 Jan 2023 10:16:42 +0100 Subject: [PATCH 050/104] Read new data for subcategories and compare with old data --- .../memilio/epidata/compareNPIData.py | 230 +++++------------- .../memilio/epidata/getNPIData.py | 42 +--- 2 files changed, 67 insertions(+), 205 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py index fc6f1d04a6..c9436e92d8 100644 --- a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py @@ -3,195 +3,89 @@ import pandas as pd import numpy as np -# directory = '/home/wend_aa/Documents/PSS/NPIs' -directory = 'c:\\work\\projets\\epidemiology\\code\\memilio\\data/pydata\\Germany/' +from memilio.epidata import getDataIntoPandasDataFrame as gd +from memilio.epidata import defaultDict as dd -#numberofcities = 2 +directory = '/home/wend_aa/memilio/data/pydata/Germany' -df_npis_old = pd.read_csv( +############################################################################################################# +# read old data for subcategories + +df_npis_old_data = pd.read_csv( os.path.join(directory, 'kr_massnahmen_unterkategorien.csv'), sep=',') # , nrows=numberofcities*1248 -print(df_npis_old) -numberofcities = int(len(df_npis_old) / 1248) -print('Number of cities', numberofcities) - -codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', - 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] - -#list = ['m01a', 'm01b'] - -# df_npis_new = pd.read_csv( -# os.path.join(directory, 'new', 'kr_massn_unterkat_m01a.csv'), -# sep=',', nrows=4000) - -# print(df_npis_old) -# print(pd.testing.assert_frame_equal(df_npis_old, df_npis_new)) -# print(pd.testing.assert_frame_equal(df_npis_old, df_npis_new)) +df_npis_old_data.rename(dd.GerEng, axis=1, inplace=True) +############################################################################################################# +# read new data for subcategories -# number of days that we have data for in new data set -numberofdays = 883 - -# create data frame that contains structure of old data -df = df_npis_old.iloc[:, :6] - -start_county = 1 - -df_local = [pd.DataFrame() for i in range(401)] -counter_col = 0 +codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', + 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] +counter_codes = 0 for code in codelist: - # print(code) - df_npis_new = pd.read_csv( - os.path.join(directory, 'new', + print(code) + df_npis_per_code = pd.read_csv( + os.path.join(directory, 'kr_massn_unterkat_{}.csv'.format(code)), - sep=',') # , skiprows=1248 - # print(df_npis_new.iloc[0:numberofdays]) - #df_npis_new = df_npis_new.iloc[1152:] - #print('df_new', df_npis_new) - counties = np.sort(df_npis_new.ags5.unique()) - if len(df_npis_new) / len(counties) != numberofdays: - print('error') - if len(counties) != 401: - print('error') - - # extract dates from new data - dates = df_npis_new.iloc[:numberofdays, 5] - # rename dates so that they match dates from old npi dataframe - dates_new = ['d' + date.replace('-', '') for date in dates] - - if (counter_col > 0) and ((old_cols != df_npis_new.columns[0:5]).any()): - print('error') - - for i in 
range(0,401): - if counter_col == 0: - df_local[i] = pd.DataFrame(columns=list(df_npis_new.columns[0:5]) + ['code'] + dates_new) - - dummy_to_append = pd.DataFrame(columns=['code'] + dates_new, data=df_npis_new[df_npis_new.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) + sep=',') - df_local[i] = pd.concat([df_local[i], dummy_to_append]) + # set some parameters for dataframe + if counter_codes == 0: + counties = np.sort(df_npis_per_code.ags5.unique()) + num_counties = len(df_npis_per_code.ags5.unique()) - old_cols = df_npis_new.columns[0:5] + # extract dates from data + dates = df_npis_per_code.iloc[:int(df_npis_per_code.shape[0]/num_counties), 5] + # rename dates so that they match dates from other npi dataframe + dates_new = ['d' + date.replace('-', '') for date in dates] - counter_col += 1 - -# set names for all rows of county -for i in range(0,401): - print('county ' + str(i)) - df_local[i][old_cols] = df_npis_new[df_npis_new.ags5 == counties[i]].iloc[0,0:5] - pd.testing.assert_frame_equal(df_npis_old[df_npis_old.ags5==counties[i]].iloc[:1152,6:].reset_index(drop=True), df_local[i].iloc[:,6:723].reset_index(drop=True), check_dtype=False) + df_local = [pd.DataFrame() for i in range(num_counties)] + + # set df for all counties + for i in range(0,num_counties): + print(i) + if counter_codes == 0: + df_local[i] = pd.DataFrame(columns=list(df_npis_per_code.columns[0:5]) + ['code'] + dates_new) -print(df) + dummy_to_append = pd.DataFrame(columns=['code'] + dates_new, data=df_npis_per_code[df_npis_per_code.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) -df_dropped = df -df_npis_old_dropped = df_npis_old -df_lost_subcat = df_npis_old - - -for i in range(numberofcities): - # drop entries for subcategories 22, 23, and 24 because they are not identified for new data set - #print((i+1)*counter, (i+1)*1248) - # df_dropped = df_dropped.drop( - # df_dropped.index[range((i+1)*counter, i*counter + 1248)]) - df_dropped = df_dropped.drop( - df_dropped.index[range((i+1)*counter_col, i*counter_col + 1248)]) - #print('df_dropped in loop', df_dropped.iloc[:(i+1)*counter + 3]) - # df_npis_old_dropped = df_npis_old_dropped.drop(df_npis_old_dropped.index[range( - # (i+1)*counter, i*counter + 1248)]) - df_npis_old_dropped = df_npis_old_dropped.drop(df_npis_old_dropped.index[range( - (i+1)*counter_col, i*counter_col + 1248)]) - - # extract all dropped rows from df_npis_old - # print('indices', (i)*1248, (i+1)*counter) - df_lost_subcat = df_lost_subcat.drop( - df_lost_subcat.index[range(i*(1248-counter_col), i*(1248-counter_col) + counter_col)]) - # print('df_dropped in loop lost subcat', - # df_lost_subcat.iloc[:(i)*(1248-counter) + 3]) - -#print('df', df.iloc[:1248-counter]) -df_dropped = df_dropped.iloc[:, :723] -print('df_dropped', df_dropped) -print('df_npis_old_dropped', df_npis_old_dropped) - - -#df = df.iloc[:counter, :723] - -#df_npis_old = df_npis_old[:counter] -#print(df_npis_old_dropped.iloc[2*counter:], df_dropped.iloc[2*counter:]) -# print('Differences between old and new data:', pd.testing.assert_frame_equal( -# df_npis_old_dropped.iloc[2*counter:, :], df_dropped.iloc[2*counter:, :], check_dtype=False)) -print('df_npis_old_dropped columns', df_npis_old_dropped.columns) -print('df_dropped columns', df_dropped.columns) - -# Check differences columnwise -nodiffcolumns = [] -for i in range(df_npis_old_dropped.shape[1]): - try: - if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[:, i]), pd.DataFrame(df_dropped.iloc[:, i]), check_dtype=False) == None: 
- nodiffcolumns.append(df_npis_old_dropped.columns[i]) - #print('No difference in column', df_npis_old_dropped.columns[i]) - - except AssertionError as e: - print(e, "\n") + df_local[i] = pd.concat([df_local[i], dummy_to_append]) + + if df_npis_per_code.iloc[i*len(dates):(i+1)*len(dates),3].nunique() > 1: + raise gd.DataError('Dates are not sorted as expected.') + # Set first five columns so that they match old format of data frame (from kr_massnahmen_unterkategorien.csv) + if counter_codes == len(codelist)-1: + df_local[i].iloc[:,0:5] = df_npis_per_code.iloc[i*len(dates),0:5].values -# Check differences rowwise -nodiffrows = [] -for i in range(df_npis_old_dropped.shape[0]): - try: - if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[i]), pd.DataFrame(df_dropped.iloc[i]), check_dtype=False) == None: - nodiffrows.append( - df_npis_old_dropped.iloc[i, :6]) - #print('No difference in column', df_npis_old_dropped.columns[i]) + counter_codes += 1 - except AssertionError as e: - print(e, "\n") +df_npis_new_data = pd.concat([df_local[i] for i in range(num_counties)]) +df_npis_new_data.rename(dd.GerEng, axis=1, inplace=True) +df_npis_new_data['NPI_code'] = df_npis_new_data['NPI_code'].str.replace('code_m', 'M') -# Check differences citywise -nodiffcities = [] -for i in range(numberofcities): - try: - if pd.testing.assert_frame_equal(pd.DataFrame(df_npis_old_dropped.iloc[i*counter_col: (i+1)*counter_col, :]), pd.DataFrame(df_dropped.iloc[i*counter_col: (i+1)*counter_col, :]), check_dtype=False) == None: - nodiffcities.append(df_npis_old_dropped.iloc[i*counter_col]['kreis']) - #print('No difference in column', df_npis_old_dropped.columns[i]) - except AssertionError as e: - print(e, "\n") +############################################################################################################# +# compare dataframes +# check if all rows for code M22, M23 and M24 in df_npis_old_data are empty +codesnotused = ((df_npis_old_data[df_npis_old_data["NPI_code"].str.contains("M22|M23|M24")].iloc[:,6:] == -99).all() == True).all() +if codesnotused == True: + print("Codes M22, M23 and M24 are not used in old data (as expected).") +else: + print("Something wrong with data.") -# check if all values in dropped subcategories are = -99 -print('df_lostsubcat', df_lost_subcat.iloc[:, :]) -print(np.sum(np.where(df_lost_subcat.iloc[:, 6:] != -99))) -columnwisecheck = (df_lost_subcat.iloc[:, 6:] == -99).all() -checkallcolumns = (columnwisecheck == True).all() -print('Dropped subcategories have never been active:', checkallcolumns) +# remove rows for codes M22, M23 and M24 from df_npis_old_data +df_npis_old_data = df_npis_old_data[~df_npis_old_data["NPI_code"].str.contains("M22|M23|M24")].copy() +# check how many days are covered in each dataframe and adjust accordingly so that both dataframes have same size +# we already know that df_npis_new_data has more columns than df_npis_old_data +df_npis_new_data = df_npis_new_data.iloc[:, :len(df_npis_old_data.columns)] -# print(nodiffcolumns) -print('Number of no diff columns', len(nodiffcolumns)) -print('Total number of columns', df_npis_old_dropped.shape[1]) +# assert if frames are equal (except index and column '_id') - -#print('df_npis_old_dropped columns', df_npis_old_dropped.columns) -#print('df_dropped columns', df_dropped.columns) -print('Column names are equal', - (df_npis_old_dropped.columns == df_dropped.columns).all()) - - -# print(nodiffrows) -print('Number of no diff rows', len(nodiffrows)) -print('Total number of rows', 
df_npis_old_dropped.shape[0]) - - -print(nodiffcities) -print('Number of no diff cities', len(nodiffcities)) -print('Total number of cities', numberofcities) - - -# save results in csv file -pd.DataFrame(nodiffcolumns).to_csv(os.path.join( - directory, 'comparedata_columns.csv')) # , nodiffrows, nodiffcities -pd.DataFrame(nodiffrows).to_csv( - os.path.join(directory, 'comparedata_rows.csv')) -pd.DataFrame(nodiffcities).to_csv( - os.path.join(directory, 'comparedata_cities.csv')) +if (pd.testing.assert_frame_equal(df_npis_old_data.iloc[:,1:].reset_index(drop=True), df_npis_new_data.iloc[:,1:].reset_index(drop=True), check_dtype = False) == None): + print('Data frames are equal.') +else: + print('Data frames are not equal.') \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 172b072128..a557479bad 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -48,7 +48,7 @@ def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, @param end_date_validation End date for validation. """ - if fine_resolution == 1: + if fine_resolution == 2: npiCodes = [npiCode + code for code in [''] + ['_' + str(i) for i in range(1, 6)]] else: @@ -136,7 +136,6 @@ def read_files(directory, fine_resolution): 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] counter_codes = 0 for code in codelist: - df_npis_per_code = pd.read_csv( os.path.join(directory, 'kr_massn_unterkat_{}.csv'.format(code)), @@ -153,11 +152,9 @@ def read_files(directory, fine_resolution): dates_new = ['d' + date.replace('-', '') for date in dates] df_local = [pd.DataFrame() for i in range(num_counties)] - print('.') # set df for all counties for i in range(0,num_counties): - print(i) if counter_codes == 0: df_local[i] = pd.DataFrame(columns=list(df_npis_per_code.columns[0:5]) + ['code'] + dates_new) @@ -172,8 +169,6 @@ def read_files(directory, fine_resolution): if counter_codes == len(codelist)-1: df_local[i].iloc[:,0:5] = df_npis_per_code.iloc[i*len(dates),0:5].values - print(df_local[i]) - counter_codes += 1 @@ -186,7 +181,6 @@ def read_files(directory, fine_resolution): df_npis_old = pd.concat([df_local[i] for i in range(num_counties)]) df_npis_old.rename(dd.GerEng, axis=1, inplace=True) df_npis_old['NPI_code'] = df_npis_old['NPI_code'].str.replace('code_m', 'M') - print(df_npis_old) # check if rows hospitals and geriatric care are still empty; # these fields have been empty so far and are thus not used @@ -472,25 +466,6 @@ def get_npi_data(fine_resolution=2, directory = os.path.join(directory, 'Germany/') gd.check_dir(directory) -<<<<<<< Updated upstream -======= - if fine_resolution > 0: - # defines delay in number of days between exceeding - # incidence threshold and NPI getting active - # delay = 0 means only one day is considered (=no delay) - npi_activation_days_threshold = 3 - npi_lifting_days_threshold = 5 - # depending on the federal state and time period, there are - # huge deviations of the lifting and activation delay which was usually - # between 1 and 14 days - # we use npi_lifting_days_threshold = 5 and npi_activation_days_threshold = 3 - # as this is the most common and has at some point been used in almost every county - print('Using a threshold for NPI activation of ' + - str(npi_activation_days_threshold) + ' days.') - print('Using a threshold for NPI lifting of ' + - str(npi_lifting_days_threshold) + ' 
days.') - ->>>>>>> Stashed changes # read manual downloaded files from directory df_npis_old, df_npis_desc, df_npis_combinations_pre = read_files(directory, fine_resolution) @@ -514,16 +489,13 @@ def get_npi_data(fine_resolution=2, # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each # other if fine_resolution > 0: -<<<<<<< Updated upstream -======= df_npis_combinations_pre = pd.read_excel( os.path.join( directory, 'combination_npis.xlsx'), engine = 'openpyxl') - num_nonexistent_codes2 = df_npis_combinations_pre['Variablenname'].str.count("M22|M23|M24").sum() - df_npis_combinations_pre = df_npis_combinations_pre.iloc[:-num_nonexistent_codes2,:] ->>>>>>> Stashed changes + num_nonexistent_codes_pre = df_npis_combinations_pre['Variablenname'].str.count("M22|M23|M24").sum() + df_npis_combinations_pre = df_npis_combinations_pre.iloc[:-num_nonexistent_codes_pre,:] # rename essential columns and throw away others column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] rename_columns = {column_names[i]: i for i in range(len(column_names))} @@ -754,7 +726,7 @@ def get_npi_data(fine_resolution=2, if list(counties_removed) != [16056]: raise gd.DataError('Error. Other counties than that of Eisenach were removed.') else: - if counties_removed.size > 0: + if counties_removed.size > 1: raise gd.DataError('Error. Other counties than that of Eisenach were removed.') # remove rows for Eisenach df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin( @@ -1091,11 +1063,7 @@ def main(): """! Main program entry.""" # arg_dict = gd.cli("testing") - df = get_npi_data(fine_resolution=1, file_format='csv') - - #df = read_files(directory, fine_resolution) - - + df = get_npi_data(fine_resolution=2, file_format='csv') if __name__ == "__main__": From 61d69d4c14218801205a6c416c3897f9a83812e5 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 31 Jan 2023 12:13:43 +0100 Subject: [PATCH 051/104] correct tests --- .../memilio/epidata/getNPIData.py | 103 ++++++++++-------- .../test_data/TestSetNPIsUnterkategorien.json | 2 +- .../epidata_test/test_epidata_getNPIData.py | 2 +- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 3bb709995d..d089b0c1cd 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -131,56 +131,60 @@ def read_files(directory, fine_resolution): """ if fine_resolution > 0: try: - - codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', - 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] - counter_codes = 0 - for code in codelist: - df_npis_per_code = pd.read_csv( - os.path.join(directory, - 'kr_massn_unterkat_{}.csv'.format(code)), - sep=',') - - # set some parameters for dataframe - if counter_codes == 0: - counties = np.sort(df_npis_per_code.ags5.unique()) - num_counties = len(df_npis_per_code.ags5.unique()) - - # extract dates from data - dates = df_npis_per_code.iloc[:int(df_npis_per_code.shape[0]/num_counties), 5] - # rename dates so that they match dates from other npi dataframe - dates_new = ['d' + date.replace('-', '') for date in dates] - - df_local = [pd.DataFrame() for i in range(num_counties)] - - # set df for all counties - for i in range(0,num_counties): + try: + codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', + 'm10', 'm11', 'm12', 'm13', 
'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] + counter_codes = 0 + for code in codelist: + df_npis_per_code = pd.read_csv( + os.path.join(directory, + 'kr_massn_unterkat_{}.csv'.format(code)), + sep=',') + + # set some parameters for dataframe if counter_codes == 0: - df_local[i] = pd.DataFrame(columns=list(df_npis_per_code.columns[0:5]) + ['code'] + dates_new) + counties = np.sort(df_npis_per_code.ags5.unique()) + num_counties = len(df_npis_per_code.ags5.unique()) - dummy_to_append = pd.DataFrame(columns=['code'] + dates_new, data=df_npis_per_code[df_npis_per_code.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) + # extract dates from data + dates = df_npis_per_code.iloc[:int(df_npis_per_code.shape[0]/num_counties), 5] + # rename dates so that they match dates from other npi dataframe + dates_new = ['d' + date.replace('-', '') for date in dates] - df_local[i] = pd.concat([df_local[i], dummy_to_append]) - - if df_npis_per_code.iloc[i*len(dates):(i+1)*len(dates),3].nunique() > 1: - raise gd.DataError('Dates are not sorted as expected.') - - # Set first five columns so that they match old format of data frame (from kr_massnahmen_unterkategorien.csv) - if counter_codes == len(codelist)-1: - df_local[i].iloc[:,0:5] = df_npis_per_code.iloc[i*len(dates),0:5].values + df_local = [pd.DataFrame() for i in range(num_counties)] + + # set df for all counties + for i in range(0,num_counties): + if counter_codes == 0: + df_local[i] = pd.DataFrame(columns=list(df_npis_per_code.columns[0:5]) + ['code'] + dates_new) - counter_codes += 1 - + dummy_to_append = pd.DataFrame(columns=['code'] + dates_new, data=df_npis_per_code[df_npis_per_code.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) + df_local[i] = pd.concat([df_local[i], dummy_to_append]) + + if df_npis_per_code.iloc[i*len(dates):(i+1)*len(dates),3].nunique() > 1: + raise gd.DataError('Dates are not sorted as expected.') + + # Set first five columns so that they match old format of data frame (from kr_massnahmen_unterkategorien.csv) + if counter_codes == len(codelist)-1: + df_local[i].iloc[:,0:5] = df_npis_per_code.iloc[i*len(dates),0:5].values + + counter_codes += 1 + df_npis_old = pd.concat([df_local[i] for i in range(num_counties)]) + df_npis_old.rename(dd.GerEng, axis=1, inplace=True) + df_npis_old['NPI_code'] = df_npis_old['NPI_code'].str.replace('code_m', 'M') + except FileNotFoundError: + df_npis_old = pd.read_csv( + os.path.join( + directory, 'kr_massnahmen_unterkategorien.csv'), + sep=',') + df_npis_old.rename(dd.GerEng, axis=1, inplace=True) except FileNotFoundError: print_manual_download( 'kr_massnahmen_unterkategorien.csv', 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') raise FileNotFoundError - df_npis_old = pd.concat([df_local[i] for i in range(num_counties)]) - df_npis_old.rename(dd.GerEng, axis=1, inplace=True) - df_npis_old['NPI_code'] = df_npis_old['NPI_code'].str.replace('code_m', 'M') # check if rows hospitals and geriatric care are still empty; # these fields have been empty so far and are thus not used @@ -472,10 +476,17 @@ def get_npi_data(fine_resolution=2, # for fine resolution we don't have codes M22 - M24 but are still listed in description if fine_resolution > 0: # count how many codes contain M22, M23 or M24 - num_nonexistent_codes = df_npis_desc['Variablenname'].str.count("M22|M23|M24").sum() + num_nonexistent_codes = df_npis_desc['Variablenname'].str.count( + "M22|M23|M24").sum() # do not include these nonexistent codes - npi_codes_prior = 
df_npis_desc['Variablenname'].iloc[:-num_nonexistent_codes] - npi_codes_prior_desc = df_npis_desc['Variable'].iloc[:-num_nonexistent_codes] + if num_nonexistent_codes != 0: + npi_codes_prior = df_npis_desc['Variablenname'].iloc[: - + num_nonexistent_codes] + npi_codes_prior_desc = df_npis_desc['Variable'].iloc[: - + num_nonexistent_codes] + else: + npi_codes_prior = df_npis_desc['Variablenname'] + npi_codes_prior_desc = df_npis_desc['Variable'] # for fine_resolution = 0 df_npis_old M22-M24 are empty) else: npi_codes_prior = df_npis_desc['Variablenname'] @@ -488,13 +499,9 @@ def get_npi_data(fine_resolution=2, # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each # other if fine_resolution > 0: - df_npis_combinations_pre = pd.read_excel( - os.path.join( - directory, 'combination_npis.xlsx'), engine = 'openpyxl') - - num_nonexistent_codes_pre = df_npis_combinations_pre['Variablenname'].str.count("M22|M23|M24").sum() - df_npis_combinations_pre = df_npis_combinations_pre.iloc[:-num_nonexistent_codes_pre,:] + if num_nonexistent_codes_pre !=0: + df_npis_combinations_pre = df_npis_combinations_pre.iloc[:-num_nonexistent_codes_pre,:] # rename essential columns and throw away others column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] rename_columns = {column_names[i]: i for i in range(len(column_names))} diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json index 0e1127237a..0576bfc6fb 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json @@ -1 +1 @@ -[{"_id":1,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":1,"d20230127":1,"d20230128":1,"d20230129":2,"d20230130":2,"d20230131":1},{"_id":2,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_1","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":3,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_2","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":4,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_3","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":5,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_4","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":6,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, 
Stadt","code":"M1_1_5","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":7,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2","d20230119":1,"d20230120":1,"d20230121":2,"d20230122":2,"d20230123":2,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":8,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":9,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_2","d20230119":2,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":10,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":11,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_4","d20230119":3,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":12,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_5","d20230119":4,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":13,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":14,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":15,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_2","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":4,"d20230128":4,"d20230129":4,"d20230130":4,"d20230131":4},{"_id":16,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":17,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_4","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":3,"d20230130":3,"d20230131":3},{"_id":18,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, 
Stadt","code":"M1_3_5","d20230119":0,"d20230120":1,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0}] \ No newline at end of file +[{"_id":1,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":1,"d20230127":1,"d20230128":1,"d20230129":2,"d20230130":2,"d20230131":1},{"_id":2,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_1","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":3,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_2","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":4,"d20230124":4,"d20230125":4,"d20230126":3,"d20230127":0,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":4,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_3","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":5,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_4","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":6,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_5","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":7,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2","d20230119":1,"d20230120":1,"d20230121":2,"d20230122":2,"d20230123":2,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":8,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":9,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_2","d20230119":2,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":10,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":11,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, 
Stadt","code":"M1_2_4","d20230119":3,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":12,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_5","d20230119":4,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":13,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":14,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":15,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_2","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":4,"d20230128":4,"d20230129":4,"d20230130":4,"d20230131":4},{"_id":16,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":17,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_4","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":3,"d20230130":3,"d20230131":3},{"_id":18,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_5","d20230119":0,"d20230120":1,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index d35f775e08..c0a26c3f30 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -291,7 +291,7 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data): # some columns should be empty # either because they're not mentioned or because the incidence is not exceeded. 
self.assertEqual( - npis_test.iloc[:, [3, 4, 5, 6, 7, 9, 11, 14, 15, 17]].values.sum(), 0) + npis_test.iloc[:, [3, 5, 6, 7, 9, 11, 14, 15, 17]].values.sum(), 0) # incidence independent NPIs should not have changed self.assertEqual( npis_test.M1_1.to_list(), From 16fdf0ac223eab7afbdae5ded488b44fe8f3faca Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 31 Jan 2023 13:20:32 +0100 Subject: [PATCH 052/104] improve check of removed counties + handle futurewarning --- .../memilio/epidata/getNPIData.py | 45 +++++++------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index d089b0c1cd..c9913c5b1a 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -167,7 +167,7 @@ def read_files(directory, fine_resolution): # Set first five columns so that they match old format of data frame (from kr_massnahmen_unterkategorien.csv) if counter_codes == len(codelist)-1: - df_local[i].iloc[:,0:5] = df_npis_per_code.iloc[i*len(dates),0:5].values + df_local[i][df_local[i].columns[0:5]] = df_npis_per_code.iloc[i*len(dates),0:5].values counter_codes += 1 df_npis_old = pd.concat([df_local[i] for i in range(num_counties)]) @@ -464,6 +464,10 @@ def get_npi_data(fine_resolution=2, if counties_considered == 'All': counties_considered = geoger.get_county_ids() + try: + counties_considered.remove(16056) + except ValueError: + pass directory = out_folder directory = os.path.join(directory, 'Germany/') @@ -723,17 +727,18 @@ def get_npi_data(fine_resolution=2, incidence_thresholds_to_npis[( incval, '_' + code_considered.split('_')[2])].append(i) - # check if more than the county of Eisenach would be removed with - # current county list + # Remove counties which are not considered. Check if all considered counties are in the dataframe counties_removed = df_npis_old[ ~df_npis_old[dd.EngEng['idCounty']].isin(counties_considered)][ dd.EngEng['idCounty']].unique() - if 16056 in counties_considered: - if list(counties_removed) != [16056]: - raise gd.DataError('Error. Other counties than that of Eisenach were removed.') + + if set(counties_considered).difference(counties_removed) == set( + counties_considered) and np.array_equal( + sorted(np.append(counties_considered, counties_removed)), + sorted(df_npis_old[dd.EngEng['idCounty']].unique())): + pass else: - if counties_removed.size > 1: - raise gd.DataError('Error. Other counties than that of Eisenach were removed.') + raise gd.DataError('Error. 
Considered counties have been removed.')
     # remove rows for Eisenach
     df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin(
         counties_considered)].reset_index(drop=True)
@@ -1032,40 +1037,22 @@ def get_npi_data(fine_resolution=2,
         pass
 
     #### start validation ####
-    if fine_resolution == 2 and (npi_activation_days_threshold + npi_lifting_days_threshold == 0):
-        start_date_validation = datetime(2020, 3, 1)
-        end_date_validation = datetime(2022, 2, 15)
+    if fine_resolution == 2 and npi_activation_days_threshold == 1 and npi_lifting_days_threshold == 1 or fine_resolution == 1:
 
         for countyID in counties_considered:
             for npiCode in [
                     'M01a_010', 'M01a_150', 'M05_120', 'M01a_010',
                     'M18_030', 'M01b_020', 'M02b_035', 'M16_050']:
-                for subcode in [''] + ['_'+str(i) for i in range(1, 6)]:
                 [
                     a, b, oldf, newf] = validate(
                     df_npis_old, df_npis, df_infec_rki, countyID,
-                    npiCode + subcode, start_npi_cols, npi_incid_start,
-                    start_date_validation, end_date_validation,
+                    npiCode, start_npi_cols, npi_incid_start,
+                    start_date_new, end_date_new,
                     fine_resolution)
                 if (a != b):
                     print('Error in NPI activation computation')
                     print(a, b, a - b)
-    elif fine_resolution == 1:
-        start_date_validation = datetime(2020, 3, 1)
-        end_date_validation = datetime(2022, 2, 15)
-
-        for countyID in counties_considered:
-            for npiCode in [
-                    'M01a_010', 'M01a_150', 'M05_120', 'M01a_010',
-                    'M18_030', 'M01b_020', 'M02b_035', 'M16_050']:
-                [a, b, oldf, newf] = validate(df_npis_old, df_npis,
-                    df_infec_rki, countyID, npiCode, start_npi_cols,
-                    npi_incid_start, start_date_validation,
-                    end_date_validation, fine_resolution)
-                if (a != b):
-                    print('Error in NPI activation computation')
-                    print(a, b, a == b)
 
     #### end validation ####
 
     if fine_resolution > 0:
From 327e36a3bbe49ed42a921a063ce89c0fb906ad0e Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Tue, 31 Jan 2023 14:04:20 +0100
Subject: [PATCH 053/104] test exclusion of npis

---
 .../test_data/TestSetNPIsUnterkategorien.json |  2 +-
 .../memilio/epidata_test/test_epidata_getNPIData.py | 12 +++++++++---
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json
index 0576bfc6fb..fb8d3029ee 100644
--- a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json
+++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsUnterkategorien.json
@@ -1 +1 @@
-[{"_id":1,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":1,"d20230127":1,"d20230128":1,"d20230129":2,"d20230130":2,"d20230131":1},{"_id":2,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_1","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":3,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_2","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":4,"d20230124":4,"d20230125":4,"d20230126":3,"d20230127":0,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":4,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, 
Stadt","code":"M1_1_3","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":5,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_4","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":6,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_5","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":7,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2","d20230119":1,"d20230120":1,"d20230121":2,"d20230122":2,"d20230123":2,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":8,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":9,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_2","d20230119":2,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":10,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":11,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_4","d20230119":3,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":12,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_5","d20230119":4,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":13,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":14,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":15,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, 
Stadt","code":"M1_3_2","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":4,"d20230128":4,"d20230129":4,"d20230130":4,"d20230131":4},{"_id":16,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":17,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_4","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":3,"d20230130":3,"d20230131":3},{"_id":18,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_5","d20230119":0,"d20230120":1,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0}] \ No newline at end of file +[{"_id":1,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":1,"d20230127":1,"d20230128":1,"d20230129":2,"d20230130":2,"d20230131":1},{"_id":2,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_1","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":4,"d20230124":4,"d20230125":4,"d20230126":3,"d20230127":0,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":3,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_2","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":4,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_3","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":5,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_4","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":6,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_1_5","d20230119":-99,"d20230120":-99,"d20230121":-99,"d20230122":-99,"d20230123":-99,"d20230124":-99,"d20230125":-99,"d20230126":-99,"d20230127":-99,"d20230128":-99,"d20230129":-99,"d20230130":-99,"d20230131":-99},{"_id":7,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2","d20230119":1,"d20230120":1,"d20230121":2,"d20230122":2,"d20230123":2,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":8,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, 
Stadt","code":"M1_2_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":9,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_2","d20230119":2,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":10,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":11,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_4","d20230119":3,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":12,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_2_5","d20230119":4,"d20230120":2,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":13,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":14,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_1","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":15,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_2","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":4,"d20230128":4,"d20230129":4,"d20230130":4,"d20230131":4},{"_id":16,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_3","d20230119":0,"d20230120":0,"d20230121":0,"d20230122":0,"d20230123":0,"d20230124":0,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0},{"_id":17,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_4","d20230119":0,"d20230120":0,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":1,"d20230126":1,"d20230127":3,"d20230128":3,"d20230129":3,"d20230130":3,"d20230131":3},{"_id":18,"ags2":1,"bundesland":"Schleswig-Holstein","ags5":1001,"kreis":"Flensburg, Stadt","code":"M1_3_5","d20230119":0,"d20230120":1,"d20230121":1,"d20230122":1,"d20230123":1,"d20230124":1,"d20230125":0,"d20230126":0,"d20230127":0,"d20230128":0,"d20230129":0,"d20230130":0,"d20230131":0}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index c0a26c3f30..21ad2a8426 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -23,7 +23,7 @@ import os import pandas as pd -import numpy as 
np
+import numpy as np
 
 from datetime import date
 
@@ -291,7 +291,7 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data):
         # some columns should be empty
         # either because they're not mentioned or because the incidence is not exceeded.
         self.assertEqual(
-            npis_test.iloc[:, [3, 5, 6, 7, 9, 11, 14, 15, 17]].values.sum(), 0)
+            npis_test.iloc[:, [4, 5, 6, 7, 9, 11, 14, 15, 17]].values.sum(), 0)
         # incidence independent NPIs should not have changed
         self.assertEqual(
             npis_test.M1_1.to_list(),
@@ -303,8 +303,9 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data):
             npis_test.M1_3.to_list(),
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
         # test remaining_columns
+        # incidence-dependent NPIs can first be activated on day 4 due to activation_days_threshold=3
         # incidence is constantly > 20
-        #M1_1_1,M1_2_1,M1_3_1,M1_1_2,M1_3_2 always 0
+        # M1_2_1,M1_3_1,M1_1_2,M1_3_2 always 0
         self.assertEqual(
             npis_test.M1_2_2.tolist(),
             [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
@@ -317,6 +318,11 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data):
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])
         #_5 is always 0 since incidence is never > 100 in this test
 
+        # M1_1_1 should not be active when M1_2_{2,3,4,5} or M1_3_{2,3,4,5} is active
+        self.assertEqual(
+            npis_test.M1_1_1.to_list(),
+            [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])
+
 
 if __name__ == '__main__':
     unittest.main()

From 1e77caac7ff682589c2a95bd877ffc03a53ab6c6 Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Tue, 31 Jan 2023 14:05:57 +0100
Subject: [PATCH 054/104] suggested change

---
 pycode/memilio-epidata/memilio/epidata/getNPIData.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index c9913c5b1a..9a8a1c9e6e 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -1037,7 +1037,7 @@ def get_npi_data(fine_resolution=2,
             pass
 
     #### start validation ####
-    if fine_resolution == 2 and npi_activation_days_threshold == 1 and npi_lifting_days_threshold == 1 or fine_resolution == 1:
+    if fine_resolution > 0 and npi_activation_days_threshold == 1 and npi_lifting_days_threshold == 1:
 
         for countyID in counties_considered:
             for npiCode in [

From f7ba4edf2aaddc895b17791d0b8e4c1b6bc2b0b0 Mon Sep 17 00:00:00 2001
From: Anna Wendler
Date: Wed, 1 Feb 2023 16:34:33 +0100
Subject: [PATCH 055/104] Adapt documentation of activate_npis

---
 .../memilio-epidata/memilio/epidata/getNPIData.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index 9a8a1c9e6e..6d02c97b1d 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -250,12 +250,13 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, np
     For a formerly active NPI to be lifted, the incidence has to be below
     the threshold for npi_lifting_days_threshold many days.
 
-    If one of the former cases holds true, then the activation or lifting
-    happens the day following the satisfaction of the criterion. This is in
-    accordance with case reporting that can only happen after the day has
-    finished and as these reports generally appeared in the morning, the NPI
-    can not directly be activated or lifted that same day. Please see the
-    examples for a better understanding.
+    If one of the former cases holds true, then the activation or lifting happens
+    two days after the satisfaction of the criterion. This is in accordance with
+    case reporting that can only happen after the day has finished and as these
+    reports generally appeared in the morning for the previous day, the NPI can
+    not directly be activated or lifted that day but only on the next day. Hence
+    the incidence-dependent NPI is activated or lifted two days after the threshold
+    is or is not exceeded. Please see the examples for a better understanding.
 
     Example (Threshold=3.5):
     local_incid=pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])

From 141c629bb25c96bf7e8a3e0d9476eeb2454cfc6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Thu, 2 Feb 2023 11:26:26 +0100
Subject: [PATCH 056/104] exclusion and reduction of contradictory NPIs

---
 .../memilio/epidata/getNPIData.py | 50 ++++++++++++++++---
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index 6d02c97b1d..8868777507 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -497,12 +497,16 @@ def get_npi_data(fine_resolution=2,
     npi_codes_prior = df_npis_desc['Variablenname']
     npi_codes_prior_desc = df_npis_desc['Variable']
 
-    # for fine_resolution > 0 deactivation of non-combinable
-    # incidence-dependent NPIs has to be conducted; therefore we defined a
-    # matrix of possible combinations of NPIs (marked with an X if combinable)
-    # NPIs of different main category (e.g., M01a and M04) can always be
-    # combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each
-    # other
+    # For fine_resolution > 0 deactivation of non-combinable / conflicting
+    # (incidence-dependent) NPIs has to be conducted.
+    # For all NPIs, we defined a matrix of possible combinations of NPIs;
+    # marked with an X if combinable.
+    # Exclusion of NPIs of different main category (e.g., M01a and M04)
+    # happens based on this table and the strictness index provided by Corona-
+    # Datenplattform. Those of, e.g., M01a_010_3 and M01a_080_4 exclude each
+    # other according to the threshold they were prescribed with. Active NPIs
+    # with low incidence thresholds deactivate conflicting NPIs with higher
+    # thresholds.
     if fine_resolution > 0:
         num_nonexistent_codes_pre = df_npis_combinations_pre['Variablenname'].str.count("M22|M23|M24").sum()
         if num_nonexistent_codes_pre !=0:
             df_npis_combinations_pre = df_npis_combinations_pre.iloc[:-num_nonexistent_codes_pre,:]
@@ -529,6 +533,13 @@ def get_npi_data(fine_resolution=2,
                 list(
                     npi_groups_combinations
                     [npi_groups_combinations == code].index))
+        # read and save strictness order for NPIs of each main categorie
+        df_npis_strictness_index = {
+            npi_groups_combinations_unique[i]:
+            # TODO: Replace strictness of j for all NPIs by some value from Datenplattform...
+            {df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]][j] : j for j in range(len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]]))}
+            for i in range(len(npi_groups_combinations_unique))}
+
         # create hash table of main code to contained codes and combination matrix
         df_npis_combinations = {
             npi_groups_combinations_unique[i]:
             [
                 list(
                     df_npis_combinations_pre['Variablenname']
                     [npi_groups_idx[0]]),
                 np.eye(len(npi_groups_idx[i]))]
             for i in range(len(npi_groups_combinations_unique))}
@@ -858,6 +869,26 @@ def get_npi_data(fine_resolution=2,
 
         df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] ==
                                    countyID].copy()
 
+        # Consistency of incidence dependent NPIs:
+        # The same NPI should not be prescribed multiple times at the same day
+        # for different thresholds.
In order to avoid contradictions, only + # retain the strictest mentioned implementation. + for i in range(int(len(df_local_old)/6)): + sum_npi_inc = np.where(df_local_old.iloc[6*i+1:6*(i+1),6:].sum()>1) + if len(sum_npi_inc[0]): + print('Reduce multiple prescription of NPI ' + str(npis.loc[i, 'Description']) + ' for county ' + str(countyID)) + for j in sum_npi_inc[0]: + # get lowest index (i.e., strictest implementation of NPI). + idx_start = np.where(df_local_old.iloc[6*i+1:6*(i+1),6+j])[0].min() + # Remove less strict and thus contradictory + # implementations of the same NPI the same day. + df_local_old.iloc[6*i+1+idx_start+1:6*(i+1),6+j] = 0 + + if not all(df_local_old.iloc[6*i+1:6*(i+1),6+sum_npi_inc[0]].sum()==1): + raise gd.DataError('Consistency correction failed.') + + ## end of consistency correction ## + # potentially remove rows if they are not in npis dict npi_rows = [i in npis[dd.EngEng['npiCode']].values for i in df_local_old[dd.EngEng['npiCode']]] @@ -981,6 +1012,13 @@ def get_npi_data(fine_resolution=2, df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]] = 0 + # TODO: + # Remove conflicting non-incidence dependent NPIs according to + # strictness index of Corona-Datenplattform and exclusion + # criteria defined in df_npis_combinations + for code in df_npis_combinations.keys(): + code_cols = df_npis_combinations[code].columns + # reduction of factor space NPI x incidence threshold to NPI # by max aggregation of all incidence threshold columns per NPI if fine_resolution == 1: From 6d5220d0667622447c706272c194a7b402394fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 2 Feb 2023 16:04:11 +0100 Subject: [PATCH 057/104] deactivation of contradictory NPIs --- .../memilio/epidata/getNPIData.py | 244 ++++++++++++------ 1 file changed, 162 insertions(+), 82 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 8868777507..4c929f41a3 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -118,7 +118,7 @@ def print_manual_download(filename, url): def read_files(directory, fine_resolution): """! Reads files from local directory and returns data in dataframes - + @param directory Directory where data is loaded from. @param fine_resolution 2 [Default] or 0 or 1. Defines which categories are considered. 
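For orientation while reading the following hunks: read_files stitches one CSV per main code into a single frame. A minimal sketch of that idea, assuming the manually downloaded files kr_massn_unterkat_<code>.csv are present in directory; the shortened code list and the plain concat are simplifications, since the real function additionally transposes the date columns per county:

    import os
    import pandas as pd

    def read_subcategory_csvs(directory):
        # the real code iterates over m01a..m21; shortened list for illustration
        codelist = ['m01a', 'm01b', 'm02a']
        frames = []
        for code in codelist:
            # one file per main code, rows = counties x subcodes, columns = dates
            frames.append(pd.read_csv(
                os.path.join(directory, 'kr_massn_unterkat_{}.csv'.format(code)),
                sep=','))
        return pd.concat(frames, ignore_index=True)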
@@ -132,13 +132,15 @@ def read_files(directory, fine_resolution): if fine_resolution > 0: try: try: - codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', - 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] + codelist = [ + 'm01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', + 'm07', 'm08', 'm09', 'm10', 'm11', 'm12', 'm13', 'm14', + 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] counter_codes = 0 for code in codelist: df_npis_per_code = pd.read_csv( os.path.join(directory, - 'kr_massn_unterkat_{}.csv'.format(code)), + 'kr_massn_unterkat_{}.csv'.format(code)), sep=',') # set some parameters for dataframe @@ -147,32 +149,47 @@ def read_files(directory, fine_resolution): num_counties = len(df_npis_per_code.ags5.unique()) # extract dates from data - dates = df_npis_per_code.iloc[:int(df_npis_per_code.shape[0]/num_counties), 5] + dates = df_npis_per_code.iloc[:int( + df_npis_per_code.shape[0]/num_counties), 5] # rename dates so that they match dates from other npi dataframe - dates_new = ['d' + date.replace('-', '') for date in dates] + dates_new = [ + 'd' + date.replace('-', '') for date in dates] + + df_local = [pd.DataFrame() + for i in range(num_counties)] - df_local = [pd.DataFrame() for i in range(num_counties)] - # set df for all counties - for i in range(0,num_counties): + for i in range(0, num_counties): if counter_codes == 0: - df_local[i] = pd.DataFrame(columns=list(df_npis_per_code.columns[0:5]) + ['code'] + dates_new) + df_local[i] = pd.DataFrame( + columns=list(df_npis_per_code.columns[0: 5]) + + ['code'] + dates_new) - dummy_to_append = pd.DataFrame(columns=['code'] + dates_new, data=df_npis_per_code[df_npis_per_code.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) + dummy_to_append = pd.DataFrame( + columns=['code'] + dates_new, + data=df_npis_per_code + [df_npis_per_code.ags5 == counties[i]]. 
+ iloc[:, 6:].T.reset_index().values.copy()) df_local[i] = pd.concat([df_local[i], dummy_to_append]) - - if df_npis_per_code.iloc[i*len(dates):(i+1)*len(dates),3].nunique() > 1: - raise gd.DataError('Dates are not sorted as expected.') + + if df_npis_per_code.iloc[i * len(dates): (i + 1) * + len(dates), + 3].nunique() > 1: + raise gd.DataError( + 'Dates are not sorted as expected.') # Set first five columns so that they match old format of data frame (from kr_massnahmen_unterkategorien.csv) if counter_codes == len(codelist)-1: - df_local[i][df_local[i].columns[0:5]] = df_npis_per_code.iloc[i*len(dates),0:5].values + df_local[i][df_local[i].columns[0:5] + ] = df_npis_per_code.iloc[i*len(dates), 0:5].values counter_codes += 1 - df_npis_old = pd.concat([df_local[i] for i in range(num_counties)]) + df_npis_old = pd.concat([df_local[i] + for i in range(num_counties)]) df_npis_old.rename(dd.GerEng, axis=1, inplace=True) - df_npis_old['NPI_code'] = df_npis_old['NPI_code'].str.replace('code_m', 'M') + df_npis_old['NPI_code'] = df_npis_old['NPI_code'].str.replace( + 'code_m', 'M') except FileNotFoundError: df_npis_old = pd.read_csv( os.path.join( @@ -185,7 +202,6 @@ def read_files(directory, fine_resolution): 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') raise FileNotFoundError - # check if rows hospitals and geriatric care are still empty; # these fields have been empty so far and are thus not used test_codes = ['M23_010', 'M23_020', 'M23_030', 'M23_040', @@ -215,24 +231,24 @@ def read_files(directory, fine_resolution): df_npis_desc = pd.read_excel( os.path.join( directory, 'datensatzbeschreibung_massnahmen.xlsx'), - sheet_name=2, engine = 'openpyxl') + sheet_name=2, engine='openpyxl') else: df_npis_desc = pd.read_excel( os.path.join( directory, 'datensatzbeschreibung_massnahmen.xlsx'), - sheet_name=3, engine = 'openpyxl') + sheet_name=3, engine='openpyxl') except FileNotFoundError: print_manual_download( 'datensatzbeschreibung_massnahmen.xlsx', 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') raise FileNotFoundError - + # download combinations of npis try: if fine_resolution > 0: df_npis_combinations_pre = pd.read_excel( os.path.join( - directory, 'combination_npis.xlsx'), engine = 'openpyxl') + directory, 'combination_npis.xlsx'), engine='openpyxl') except FileNotFoundError: print('File not found.') raise FileNotFoundError @@ -240,7 +256,9 @@ def read_files(directory, fine_resolution): return df_npis_old, df_npis_desc, df_npis_combinations_pre -def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, incid_threshold): +def activate_npis_based_on_incidence( + local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, + incid_threshold): """! Computes an activation vector according to a given incidence threshold, observed incidence and activation or lifting delays. 
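A usage sketch for the function documented here, reusing the example series quoted in the docstring and the default thresholds named above; it assumes the memilio-epidata package is installed so that the function can be imported:

    import pandas as pd
    from memilio.epidata.getNPIData import activate_npis_based_on_incidence

    local_incid = pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
    # lifting after 5 days below, activation after 3 days above threshold 3.5
    int_active = activate_npis_based_on_incidence(
        local_incid, npi_lifting_days_threshold=5,
        npi_activation_days_threshold=3, incid_threshold=3.5)

The result is the 0/1 activation vector described in the docstring examples, with the first entry always 0.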
@@ -276,7 +294,7 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, np With npi_lifting_days_threshold=3, npi_activation_days_threshold=2 NPI should be activated on day 9 (and lifted on day 15; not in the vector) [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] - + Another example: With yesterday's incidence over threshold on days: [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0] @@ -284,7 +302,7 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, np NPI should be activated on day 2 and lifted on day 14 int_active should then be: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0] - + Please also note that the first column will always returned as false so the dataframe should not start with dates where NPIs are implemented. For the Corona Datenplattform frame which starts from 2020-03-01 @@ -298,7 +316,8 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, np """ if npi_lifting_days_threshold < 1 or npi_activation_days_threshold < 1: - raise ValueError('Activation or lifting day variables need to be 1 or larger') + raise ValueError( + 'Activation or lifting day variables need to be 1 or larger') # First get a Series with 0 for yesterdays incidence # is below threshold and 1 for incidence over threshold @@ -312,21 +331,23 @@ def activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, np for i in range(len(yesterdays_incid_over_threshold)): # Set int_active=0 where last npi_lifting_days_threshold+1 days did not exceed # the threshold - if yesterdays_incid_over_threshold[max(0,i-npi_lifting_days_threshold):i].values.sum() == 0: + if yesterdays_incid_over_threshold[max(0, i-npi_lifting_days_threshold):i].values.sum() == 0: int_active[i] = 0 # Set int_active=1 where last npi_activation_days_threshold+1 days did # all exceed the threshold - elif yesterdays_incid_over_threshold[max(0,i-npi_activation_days_threshold):i].values.sum() == npi_activation_days_threshold: + elif yesterdays_incid_over_threshold[max(0, i-npi_activation_days_threshold):i].values.sum() == npi_activation_days_threshold: int_active[i] = 1 # If no condition applies, set int_active to the value of the previous day - elif i>0: # for i=0, int_active always will be zero (see comment above) + # for i=0, int_active always will be zero (see comment above) + elif i > 0: int_active[i] = int_active[i-1] # elif i==0 int active is 0 return int_active -def drop_codes_and_categories(npi_codes_prior, npi_codes_prior_desc, df_npis_old, fine_resolution): +def drop_codes_and_categories( + npi_codes_prior, npi_codes_prior_desc, df_npis_old, fine_resolution): """! Drops codes and categories from original data frame if they are not used. 
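Schematically, the dropping described here reduces to an isin filter on the code column; a toy example with hypothetical codes and one date column (the real function also reconciles the codes against the description sheet):

    import pandas as pd

    df_npis_old = pd.DataFrame({'NPI_code': ['M01a_010', 'M22_010', 'M04_100'],
                                'd20200301': [1, 0, 1]})
    codes_used = ['M01a_010', 'M04_100']
    # keep only rows whose NPI code is actually used
    df_npis_old = df_npis_old[df_npis_old.NPI_code.isin(
        codes_used)].reset_index(drop=True)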
@@ -374,7 +395,7 @@ def drop_codes_and_categories(npi_codes_prior, npi_codes_prior_desc, df_npis_old missing_grouped_codes.append(mcode) if len(missing_grouped_codes) > 0: # only MCODE_NUMBER codes raise gd.DataError('Missing NPI codes: ' + - str(missing_grouped_codes)) + str(missing_grouped_codes)) else: raise gd.DataError('Missing NPI codes: ' + str(missing_codes)) @@ -399,10 +420,9 @@ def drop_codes_and_categories(npi_codes_prior, npi_codes_prior_desc, df_npis_old raise gd.DataError('Error in NPI names, please check.') else: # no dropping for fine_resolution == 0 - codes_dropped=[] - - return codes_dropped, npi_codes_prior, df_npis_old + codes_dropped = [] + return codes_dropped, npi_codes_prior, df_npis_old def get_npi_data(fine_resolution=2, @@ -411,8 +431,8 @@ def get_npi_data(fine_resolution=2, start_date=dd.defaultDict['start_date'], end_date=dd.defaultDict['end_date'], counties_considered=geoger.get_county_ids(), - npi_activation_days_threshold = 3, - npi_lifting_days_threshold = 5 + npi_activation_days_threshold=3, + npi_lifting_days_threshold=5 ): """! Loads a certain resolution of recorded NPI data from the Corona Datenplattform and extracts the counties asked for and @@ -458,9 +478,9 @@ def get_npi_data(fine_resolution=2, of days below case incidence threshold threshold to lift NPIs. """ - # Depending on the federal state and time period, there are - # huge differences for number of days before the lifting and activation. - # It was usually between 1 and 14 days. We use npi_lifting_days_threshold = 5 + # Depending on the federal state and time period, there are + # huge differences for number of days before the lifting and activation. + # It was usually between 1 and 14 days. We use npi_lifting_days_threshold = 5 # and npi_activation_days_threshold = 3 as default averaged value. if counties_considered == 'All': @@ -475,9 +495,10 @@ def get_npi_data(fine_resolution=2, gd.check_dir(directory) # read manual downloaded files from directory - df_npis_old, df_npis_desc, df_npis_combinations_pre = read_files(directory, fine_resolution) + df_npis_old, df_npis_desc, df_npis_combinations_pre = read_files( + directory, fine_resolution) - # get existing codes that are used + # get existing codes that are used # for fine resolution we don't have codes M22 - M24 but are still listed in description if fine_resolution > 0: # count how many codes contain M22, M23 or M24 @@ -501,16 +522,18 @@ def get_npi_data(fine_resolution=2, # (incidence-dependent) NPIs has to be conducted. # For all NPIs, we defined a matrix of possible combinations of NPIs; # marked with an X if combinable. - # Exclusion of NPIs of different main category (e.g., M01a and M04) + # Exclusion of NPIs of different main category (e.g., M01a and M04) # happens based on this table and the strictness index provided by Corona- # Datenplattform. Those of, e.g., M01a_010_3 and M01a_080_4 exclude each # other according to the threshold they were prescribed with. Active NPIs - # with low incidence thresholds deactivate conflicting NPIs with higher + # with low incidence thresholds deactivate conflicting NPIs with higher # thresholds. 
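The combination table referred to in this comment can be pictured as one symmetric 0/1 block per main category; a toy sketch with made-up subcodes, where 1 marks combinable and 0 mutually exclusive NPIs:

    import numpy as np
    import pandas as pd

    codes = ['M01a_010', 'M01a_020', 'M01a_080']
    combi = pd.DataFrame(np.array([[1, 0, 1],
                                   [0, 1, 1],
                                   [1, 1, 1]]), columns=codes, index=codes)
    # exclusion is mutual, so the matrix has to be symmetric
    assert (combi.values == combi.values.T).all()

This mirrors the symmetry check against the transposed matrix performed when the table is read in.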
if fine_resolution > 0: - num_nonexistent_codes_pre = df_npis_combinations_pre['Variablenname'].str.count("M22|M23|M24").sum() - if num_nonexistent_codes_pre !=0: - df_npis_combinations_pre = df_npis_combinations_pre.iloc[:-num_nonexistent_codes_pre,:] + num_nonexistent_codes_pre = df_npis_combinations_pre['Variablenname'].str.count( + "M22|M23|M24").sum() + if num_nonexistent_codes_pre != 0: + df_npis_combinations_pre = df_npis_combinations_pre.iloc[: - + num_nonexistent_codes_pre, :] # rename essential columns and throw away others column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] rename_columns = {column_names[i]: i for i in range(len(column_names))} @@ -537,8 +560,9 @@ def get_npi_data(fine_resolution=2, df_npis_strictness_index = { npi_groups_combinations_unique[i]: # TODO: Replace strictness of j for all NPIs by some value from Datenplattform... - {df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]][j] : j for j in range(len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]]))} - for i in range(len(npi_groups_combinations_unique))} + {df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]][j]: j for j in range( + len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]]))} + for i in range(len(npi_groups_combinations_unique))} # create hash table of main code to contained codes and combination matrix df_npis_combinations = { @@ -596,7 +620,7 @@ def get_npi_data(fine_resolution=2, df_in_valid = pd.read_excel( os.path.join( directory, 'combinations_npis_cleanoutput.xlsx'), - sheet_name=i, engine = 'openpyxl') + sheet_name=i, engine='openpyxl') if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out): print('Error in combination matrix.') del df_in_valid @@ -660,13 +684,18 @@ def get_npi_data(fine_resolution=2, if fine_resolution > 0: for code in df_npis_combinations.keys(): local_codes_used_rows = df_npis_combinations[code][1].Code.isin( - npis.NPI_code) + npis['NPI_code']) local_codes_used_cols = df_npis_combinations[code][1].columns.isin( - npis.NPI_code) + npis['NPI_code']) # overwrite item 0 since codes are stored in *.columns df_npis_combinations[code] = df_npis_combinations[code][1].loc[local_codes_used_rows, - local_codes_used_cols].reset_index(drop=True).copy() + local_codes_used_cols].reset_index(drop=True).copy() + + # also remove strictness indices of unused codes + df_npis_strictness_index[code] = { + key: val for key, val in df_npis_strictness_index[code].items() + if key in npis['NPI_code'].values} # prepare grouping of NPIs to reduce product space of # NPI x active_from_inc (with values "incidence does not matter", and @@ -792,7 +821,7 @@ def get_npi_data(fine_resolution=2, df_population = pd.read_json( directory + "county_current_population.json") except: - df_population=gpd.get_population_data() + df_population = gpd.get_population_data() min_date.append( df_infec_rki[dd.EngEng['date']].min().to_pydatetime()) max_date.append( @@ -857,10 +886,9 @@ def get_npi_data(fine_resolution=2, df_infec_local = df_infec_local[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( df_infec_local[dd.EngEng['date']] <= end_date_new)].reset_index() - local_incid = df_infec_local['Incidence'].copy() # Count counties with start cases >= 1: - # In this case NPI activation cannot be ensured to work as expected + # In this case NPI activation cannot be ensured to work as expected if cases_first_value >= 1: counter_cases_start += 1 @@ -874,17 +902,24 @@ def get_npi_data(fine_resolution=2, # for different thresholds. 
In order to avoid contradictions, only # retain the strictest mentioned implementation. for i in range(int(len(df_local_old)/6)): - sum_npi_inc = np.where(df_local_old.iloc[6*i+1:6*(i+1),6:].sum()>1) + sum_npi_inc = np.where( + df_local_old.iloc[6*i+1:6*(i+1), 6:].sum() > 1) if len(sum_npi_inc[0]): - print('Reduce multiple prescription of NPI ' + str(npis.loc[i, 'Description']) + ' for county ' + str(countyID)) + print( + 'Reduce multiple prescription in county ' + str(countyID) + + ' for NPI ' + str(npis.loc[i, 'Description'])) for j in sum_npi_inc[0]: # get lowest index (i.e., strictest implementation of NPI). - idx_start = np.where(df_local_old.iloc[6*i+1:6*(i+1),6+j])[0].min() + idx_start = np.where( + df_local_old.iloc[6*i+1:6*(i+1), 6+j])[0].min() # Remove less strict and thus contradictory # implementations of the same NPI the same day. - df_local_old.iloc[6*i+1+idx_start+1:6*(i+1),6+j] = 0 + df_local_old.iloc[6*i+1+idx_start+1:6*(i+1), 6+j] = 0 - if not all(df_local_old.iloc[6*i+1:6*(i+1),6+sum_npi_inc[0]].sum()==1): + if not all( + df_local_old.iloc + [6 * i + 1: 6 * (i + 1), + 6 + sum_npi_inc[0]].sum() == 1): raise gd.DataError('Consistency correction failed.') ## end of consistency correction ## @@ -893,8 +928,7 @@ def get_npi_data(fine_resolution=2, npi_rows = [i in npis[dd.EngEng['npiCode']].values for i in df_local_old[dd.EngEng['npiCode']]] - - # create columns for date, county ID + # create columns for date, county ID df_local_new = pd.DataFrame( columns=[dd.EngEng['date']] + [dd.EngEng['idCounty']]) @@ -951,7 +985,9 @@ def get_npi_data(fine_resolution=2, if level[0] >= 0: # level[0] = incidvalthrsh # get days where npis are active as int (1/0) - int_active = activate_npis_based_on_incidence(local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, level[0]) + int_active = activate_npis_based_on_incidence( + local_incid, npi_lifting_days_threshold, + npi_activation_days_threshold, level[0]) # multiply rows of data frame by either 1 if threshold # passed (i.e., mentioned NPI is active) or zero @@ -988,7 +1024,7 @@ def get_npi_data(fine_resolution=2, # extract codes subcodes_nocombi = df_npis_combinations[code].loc[scidx, :] # only consider those codes which cannot be - # combined; for these values of 1 have to be + # combined; for these, values of 1 have to be # set to 0 subcodes_nocombi = list( subcodes_nocombi @@ -1012,12 +1048,53 @@ def get_npi_data(fine_resolution=2, df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]] = 0 - # TODO: - # Remove conflicting non-incidence dependent NPIs according to + # TODO: (maybe also remove incidence-dependent here? to discuss!) + # Remove conflicting non-incidence (?) 
dependent NPIs according to # strictness index of Corona-Datenplattform and exclusion # criteria defined in df_npis_combinations for code in df_npis_combinations.keys(): - code_cols = df_npis_combinations[code].columns + # get other subcodes MX_Y_* below current main code MX_Y + subcodes_list = list(df_npis_strictness_index[code].keys()) + # sort index reversely with the strictest (highest) index first + idx_strictness_sorted_rev = np.argsort( + list(df_npis_strictness_index[code].values()))[::-1] + + for jj in range(len(idx_strictness_sorted_rev)-1): + # get index of NPI of a certain strictness + idx_strictness = idx_strictness_sorted_rev[jj] + # get code of corresponding NPI + subcode = subcodes_list[idx_strictness] + + # get indices of days where subcode is active + subcode_active = np.where(df_local_new.loc[:,subcode]>0)[0] + + if len(subcode_active>0): + # get indices of less strict NPIs + idxs_less_strict = np.sort(idx_strictness_sorted_rev[jj+1:]) + + # extract true/false list of combination of subcodes + subcodes_nocombi = df_npis_combinations[code].loc[:, subcode] + # only consider those codes which cannot be combined; + # for these, values of 1 have to be set to 0 + subcodes_nocombi = list( + subcodes_nocombi + [subcodes_nocombi == 0].index) + + # intersect non-combinable subcodes with less strict subcodes + idx_subcodes_deactivation = np.sort(list( + set(idxs_less_strict).intersection(subcodes_nocombi))) + + for kk in idx_subcodes_deactivation: + days_deact = np.where(df_local_new.loc[subcode_active, [subcodes_list[kk] + str(appendix[1]) for appendix in incidence_thresholds]].sum(axis=1)>0)[0] + if len(days_deact) > 0: + print('Deactivating for ' + 'County ' + str(countyID)) + print('\t' + str(subcodes_list[kk]) + ' due to ' + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') + print('\n') + df_local_new.loc[subcode_active, + [subcodes_list[kk] + + str(appendix[1]) + for appendix in + incidence_thresholds]] = 0 # reduction of factor space NPI x incidence threshold to NPI # by max aggregation of all incidence threshold columns per NPI @@ -1037,7 +1114,10 @@ def get_npi_data(fine_resolution=2, start_time = time.perf_counter() - df_npis = pd.concat([df_npis.copy(), df_local_new.copy()], ignore_index=True) + df_npis = pd.concat( + [df_npis.copy(), + df_local_new.copy()], + ignore_index=True) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1054,11 +1134,11 @@ def get_npi_data(fine_resolution=2, str(int(time_remain / 60)) + ' min.') if counter_cases_start >= len(counties_considered)*0.05: print('WARNING: DataFrame starts with reported cases > 0 ' - 'for more than 5 percent of the counties to be considered. ' - 'In this case, incidence computation and activation of ' - 'incidence-dependent NPIs cannot be ensured to work correctly. ' - 'Please consider a start date of some weeks ahead of the ' - 'time window to be analyzed for NPI\'s effects.') + 'for more than 5 percent of the counties to be considered. ' + 'In this case, incidence computation and activation of ' + 'incidence-dependent NPIs cannot be ensured to work correctly. 
' + 'Please consider a start date of some weeks ahead of the ' + 'time window to be analyzed for NPI\'s effects.') # print sub counters print('Sub task counters are: ') @@ -1082,15 +1162,15 @@ def get_npi_data(fine_resolution=2, for npiCode in [ 'M01a_010', 'M01a_150', 'M05_120', 'M01a_010', 'M18_030', 'M01b_020', 'M02b_035', 'M16_050']: - [ - a, b, oldf, newf] = validate( - df_npis_old, df_npis, df_infec_rki, countyID, - npiCode, start_npi_cols, npi_incid_start, - start_date_new, end_date_new, - fine_resolution) - if (a != b): - print('Error in NPI activation computation') - print(a, b, a - b) + [ + a, b, oldf, newf] = validate( + df_npis_old, df_npis, df_infec_rki, countyID, + npiCode, start_npi_cols, npi_incid_start, + start_date_new, end_date_new, + fine_resolution) + if (a != b): + print('Error in NPI activation computation') + print(a, b, a - b) #### end validation #### From e555d7acf33aea856d971abee397821ae30b47f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 3 Feb 2023 12:49:03 +0100 Subject: [PATCH 058/104] strictness and correction combis --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 4c929f41a3..a9d35d1d7d 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -560,8 +560,8 @@ def get_npi_data(fine_resolution=2, df_npis_strictness_index = { npi_groups_combinations_unique[i]: # TODO: Replace strictness of j for all NPIs by some value from Datenplattform... - {df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]][j]: j for j in range( - len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[0]]))} + {df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]].values[j]: j for j in range( + len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]]))} for i in range(len(npi_groups_combinations_unique))} # create hash table of main code to contained codes and combination matrix @@ -570,7 +570,7 @@ def get_npi_data(fine_resolution=2, [ list( df_npis_combinations_pre['Variablenname'] - [npi_groups_idx[0]]), + [npi_groups_idx[i]]), np.eye(len(npi_groups_idx[i]))] for i in range(len(npi_groups_combinations_unique))} From 5fc47caaca056c8d04c127df636a88d3bf68d218 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 3 Feb 2023 12:51:21 +0100 Subject: [PATCH 059/104] code length reduc --- .../memilio-epidata/memilio/epidata/getNPIData.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index a9d35d1d7d..1ad7f40f77 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -581,16 +581,17 @@ def get_npi_data(fine_resolution=2, for i in range(len(npi_groups_idx)): codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i], 'Variablenname'].values - df_npis_combinations[npi_groups_combinations_unique[i]][1] = df_npis_combinations_pre.iloc[npi_groups_idx[i], - start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values - if (df_npis_combinations[npi_groups_combinations_unique[i]][1]-np.transpose(df_npis_combinations[npi_groups_combinations_unique[i]][1])).max() > 0: + npic_uniq = npi_groups_combinations_unique[i] # reduce code 
length + df_npis_combinations[npic_uniq][1] = df_npis_combinations_pre.iloc[ + npi_groups_idx[i], start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values + if (df_npis_combinations[npic_uniq][1]-np.transpose(df_npis_combinations[npic_uniq][1])).max() > 0: print('Error in input file: Please correct combination matrix input.') # make it a dataframe to allow easy removal of code lines and rows # if they are not used later on - df_npis_combinations[npi_groups_combinations_unique[i]][1] = pd.DataFrame( - df_npis_combinations[npi_groups_combinations_unique[i]][1], + df_npis_combinations[npic_uniq][1] = pd.DataFrame( + df_npis_combinations[npic_uniq][1], columns=codes_local) - df_npis_combinations[npi_groups_combinations_unique[i]][1].insert( + df_npis_combinations[npic_uniq][1].insert( 0, 'Code', codes_local) del df_npis_combinations_pre From 8fc4c5e646ca808b77bb50188c78942e22cc91d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Sat, 4 Feb 2023 00:27:56 +0100 Subject: [PATCH 060/104] rework strictness --- .../memilio/epidata/getNPIData.py | 95 ++++++++++--------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 1ad7f40f77..c285689faf 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -248,7 +248,7 @@ def read_files(directory, fine_resolution): if fine_resolution > 0: df_npis_combinations_pre = pd.read_excel( os.path.join( - directory, 'combination_npis.xlsx'), engine='openpyxl') + directory, 'combination_npis_incl_ranking.xlsx'), engine='openpyxl') except FileNotFoundError: print('File not found.') raise FileNotFoundError @@ -519,15 +519,22 @@ def get_npi_data(fine_resolution=2, npi_codes_prior_desc = df_npis_desc['Variable'] # For fine_resolution > 0 deactivation of non-combinable / conflicting - # (incidence-dependent) NPIs has to be conducted. - # For all NPIs, we defined a matrix of possible combinations of NPIs; - # marked with an X if combinable. - # Exclusion of NPIs of different main category (e.g., M01a and M04) - # happens based on this table and the strictness index provided by Corona- - # Datenplattform. Those of, e.g., M01a_010_3 and M01a_080_4 exclude each - # other according to the threshold they were prescribed with. Active NPIs - # with low incidence thresholds deactivate conflicting NPIs with higher - # thresholds. + # NPIs has to be conducted. + # + # NPIs of different main categories (e.g., M01a and M04) can always be + # prescribed together as they target different locations and settings. + # + # NPIs with the same main code (i.e., targeting the same location, e.g., + # schools, or the same set of NPIs, e.g., masks) can exclude each other. + # Exclusion happens based on table provided in xlsx or csv format. + # + # In first place, NPIs of higher stringency index as defined by the Corona- + # Datenplattform deactivate NPIs with lower stringency index. + # NPIs of the same main code and with the same stringency index may or + # may not exclude each other according to the threshold they were + # prescribed with. Prescribed and active NPIs with high incidence thresholds + # deactivate conflicting NPIs with lower thresholds (as the latter are + # considered to be less strict). 
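The rule described in this comment can be condensed to a few lines. In the following sketch the strictness ranks, the activity table and the non-combinable pair are hypothetical; only the strictest-first traversal mirrors the implementation:

    import numpy as np
    import pandas as pd

    strictness = {'M1_1': 0, 'M1_2': 1, 'M1_3': 2}   # higher = stricter
    nocombi = {('M1_3', 'M1_2')}   # pairs that must not be active together
    active = pd.DataFrame({'M1_1': [1, 1, 0],
                           'M1_2': [0, 1, 1],
                           'M1_3': [1, 1, 0]})

    codes = list(strictness.keys())
    order = np.argsort(list(strictness.values()))[::-1]  # strictest first
    for pos in range(len(order) - 1):
        strict_code = codes[order[pos]]
        days_active = np.where(active[strict_code] > 0)[0]
        for idx_weak in order[pos + 1:]:
            weak_code = codes[idx_weak]
            if (strict_code, weak_code) in nocombi:
                # less strict, conflicting NPI is overruled on these days
                active.loc[days_active, weak_code] = 0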
if fine_resolution > 0: num_nonexistent_codes_pre = df_npis_combinations_pre['Variablenname'].str.count( "M22|M23|M24").sum() @@ -535,11 +542,12 @@ def get_npi_data(fine_resolution=2, df_npis_combinations_pre = df_npis_combinations_pre.iloc[: - num_nonexistent_codes_pre, :] # rename essential columns and throw away others - column_names = ['Unnamed: ' + str(i) for i in range(3, 19)] + columns_combinations = np.where((df_npis_combinations_pre=='x').any()==True)[0] + column_names = ['Unnamed: ' + str(i) for i in range(columns_combinations[0], columns_combinations[-1]+1)] rename_columns = {column_names[i]: i for i in range(len(column_names))} df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) df_npis_combinations_pre = df_npis_combinations_pre[[ - 'Variablenname'] + [i for i in range(0, 16)]] + 'Variablenname', 'Massnahmenindex'] + [i for i in range(0, len(columns_combinations))]] # replace empty cells by zeros and x-marked cells by ones df_npis_combinations_pre = df_npis_combinations_pre.replace(np.nan, 0) df_npis_combinations_pre = df_npis_combinations_pre.replace('x', 1) @@ -556,34 +564,33 @@ def get_npi_data(fine_resolution=2, list( npi_groups_combinations [npi_groups_combinations == code].index)) - # read and save strictness order for NPIs of each main categorie - df_npis_strictness_index = { - npi_groups_combinations_unique[i]: - # TODO: Replace strictness of j for all NPIs by some value from Datenplattform... - {df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]].values[j]: j for j in range( - len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]]))} - for i in range(len(npi_groups_combinations_unique))} - # create hash table of main code to contained codes and combination matrix + # TODO: look at: + # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M16_100', 'M16_100_1', 'M16_100_2', 'M16_100_3', 'M16_100_4', 'M16_100_5']))].iloc[:,6:]==1) + # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01a_020', 'M01a_020_1', 'M01a_020_2', 'M01a_020_3', 'M01a_020_4', 'M01a_020_5']))].iloc[:,6:]==1) + # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01b_020', 'M01b_020_1', 'M01b_020_2', 'M01b_020_3', 'M01b_020_4', 'M01b_020_5']))].iloc[:,6:]==1) + + # create hash table of main code to strictness rankings inside main + # code and combination matrix inside the same strictness rank df_npis_combinations = { npi_groups_combinations_unique[i]: [ - list( - df_npis_combinations_pre['Variablenname'] - [npi_groups_idx[i]]), + {df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]].values[j]: + df_npis_combinations_pre['Massnahmenindex'][npi_groups_idx[i]].values[j] for j in range( + len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]]))}, np.eye(len(npi_groups_idx[i]))] for i in range(len(npi_groups_combinations_unique))} + # run through all groups and set possible combinations according to # read combination matrix - start_comb_matrix = list( - df_npis_combinations_pre.columns).index('Variablenname')+1 + start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+3 for i in range(len(npi_groups_idx)): codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i], 'Variablenname'].values - npic_uniq = npi_groups_combinations_unique[i] # reduce code length - df_npis_combinations[npic_uniq][1] = df_npis_combinations_pre.iloc[ - npi_groups_idx[i], start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values + npic_uniq = npi_groups_combinations_unique[i] # reduce code length + 
df_npis_combinations[npic_uniq][1] = df_npis_combinations_pre.iloc[np.array(npi_groups_idx[i]), + start_comb_matrix:start_comb_matrix+len(npi_groups_idx[i])].values if (df_npis_combinations[npic_uniq][1]-np.transpose(df_npis_combinations[npic_uniq][1])).max() > 0: print('Error in input file: Please correct combination matrix input.') # make it a dataframe to allow easy removal of code lines and rows @@ -689,14 +696,15 @@ def get_npi_data(fine_resolution=2, local_codes_used_cols = df_npis_combinations[code][1].columns.isin( npis['NPI_code']) - # overwrite item 0 since codes are stored in *.columns - df_npis_combinations[code] = df_npis_combinations[code][1].loc[local_codes_used_rows, + # remove strictness indices of unused codes + df_npis_combinations[code][0] = { + key: val for key, val in df_npis_combinations[code][0].items() + if key in npis['NPI_code'].values} + # remove columns of combinations + df_npis_combinations[code][1] = df_npis_combinations[code][1].loc[local_codes_used_rows, local_codes_used_cols].reset_index(drop=True).copy() - # also remove strictness indices of unused codes - df_npis_strictness_index[code] = { - key: val for key, val in df_npis_strictness_index[code].items() - if key in npis['NPI_code'].values} + # prepare grouping of NPIs to reduce product space of # NPI x active_from_inc (with values "incidence does not matter", and @@ -1009,7 +1017,7 @@ def get_npi_data(fine_resolution=2, level_lower = [lev for lev in levels_exclusion if lev[0] < level[0]] for code in df_npis_combinations.keys(): - code_cols = df_npis_combinations[code].columns + code_cols = df_npis_combinations[code][1].columns # iterate over subcode indices for scidx in range(len(code_cols)-1): # check if code was used, otherwise nothing to @@ -1023,7 +1031,7 @@ def get_npi_data(fine_resolution=2, indicator_code_active > 0)[0] if len(indicator_code_active_idx) > 0: # extract codes - subcodes_nocombi = df_npis_combinations[code].loc[scidx, :] + subcodes_nocombi = df_npis_combinations[code][1].loc[scidx, :] # only consider those codes which cannot be # combined; for these, values of 1 have to be # set to 0 @@ -1040,12 +1048,12 @@ def get_npi_data(fine_resolution=2, # where NPI code_cols[scidx] + level[1] # is active if df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]].any(): - # print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0])) - # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0]) - # print('Due to Incidence > ' + str(level[0]) + ' and NPI ') - # print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0]) + print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0])) + print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0]) + print('Due to Incidence > ' + str(level[0]) + ' and NPI ') + print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0]) # print(list(df_local_new.loc[indicator_code_active_idx,'Date'])) - # print('\n') + print('\n') df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]] = 0 @@ -1055,10 +1063,10 @@ def get_npi_data(fine_resolution=2, # criteria defined in df_npis_combinations for code in df_npis_combinations.keys(): # get other subcodes MX_Y_* below current main code MX_Y - subcodes_list = 
list(df_npis_strictness_index[code].keys()) + subcodes_list = list(df_npis_combinations[code][0].keys()) # sort index reversely with the strictest (highest) index first idx_strictness_sorted_rev = np.argsort( - list(df_npis_strictness_index[code].values()))[::-1] + list(df_npis_combinations[code][0].values()))[::-1] for jj in range(len(idx_strictness_sorted_rev)-1): # get index of NPI of a certain strictness @@ -1074,6 +1082,7 @@ def get_npi_data(fine_resolution=2, idxs_less_strict = np.sort(idx_strictness_sorted_rev[jj+1:]) # extract true/false list of combination of subcodes + # TODO [0] or [1] ? subcodes_nocombi = df_npis_combinations[code].loc[:, subcode] # only consider those codes which cannot be combined; # for these, values of 1 have to be set to 0 From ede2917450cf19b9288af5b7dc14bbba96d65f74 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 21 Feb 2023 13:17:30 +0100 Subject: [PATCH 061/104] fix test --- .../memilio/epidata/getNPIData.py | 25 ++++++++++++------- .../test_data/TestSetNPIsCombinations.json | 2 +- .../epidata_test/test_epidata_getNPIData.py | 9 +++++-- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index c285689faf..ec1c86b43e 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -541,8 +541,14 @@ def get_npi_data(fine_resolution=2, if num_nonexistent_codes_pre != 0: df_npis_combinations_pre = df_npis_combinations_pre.iloc[: - num_nonexistent_codes_pre, :] + + # drop 0 column if existent + try: + df_npis_combinations_pre.drop(columns=0) + except KeyError: + pass # rename essential columns and throw away others - columns_combinations = np.where((df_npis_combinations_pre=='x').any()==True)[0] + columns_combinations = np.where((df_npis_combinations_pre=='x').any()==True)[0] #maybe rename columns_used ? 
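The 'x'-column detection introduced above can be checked in isolation; a small demonstration on a toy frame, where np.where yields the positional indices of all columns that contain at least one 'x':

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'Variablenname': ['M1_1', 'M1_2'],
                       'Unnamed: 4': ['x', None],
                       'Unnamed: 5': [None, 'x']})
    columns_used = np.where((df == 'x').any())[0]
    print(columns_used)  # [1 2]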
column_names = ['Unnamed: ' + str(i) for i in range(columns_combinations[0], columns_combinations[-1]+1)] rename_columns = {column_names[i]: i for i in range(len(column_names))} df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) @@ -584,7 +590,7 @@ def get_npi_data(fine_resolution=2, # run through all groups and set possible combinations according to # read combination matrix - start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+3 + start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+2 for i in range(len(npi_groups_idx)): codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i], 'Variablenname'].values @@ -1048,12 +1054,13 @@ def get_npi_data(fine_resolution=2, # where NPI code_cols[scidx] + level[1] # is active if df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]].any(): - print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0])) - print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0]) - print('Due to Incidence > ' + str(level[0]) + ' and NPI ') - print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0]) - # print(list(df_local_new.loc[indicator_code_active_idx,'Date'])) - print('\n') + if subcode_excl != 'M03_020': + print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0])) + print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0]) + print('Due to Incidence > ' + str(level[0]) + ' and NPI ') + print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0]) + # print(list(df_local_new.loc[indicator_code_active_idx,'Date'])) + print('\n') df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]] = 0 @@ -1083,7 +1090,7 @@ def get_npi_data(fine_resolution=2, # extract true/false list of combination of subcodes # TODO [0] or [1] ? 
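Concerning the TODO just above: after this rework, every entry of df_npis_combinations holds a two-element list, so index [0] is the strictness ranking and index [1] the combination matrix. A schematic toy entry:

    import numpy as np
    import pandas as pd

    df_npis_combinations = {
        'M1': [
            {'M1_1': 0, 'M1_2': 1},   # [0]: subcode -> strictness rank
            pd.DataFrame(np.eye(2), columns=['M1_1', 'M1_2']),  # [1]: combinations
        ]}
    strictness_ranking = df_npis_combinations['M1'][0]
    combination_matrix = df_npis_combinations['M1'][1]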
- subcodes_nocombi = df_npis_combinations[code].loc[:, subcode] + subcodes_nocombi = df_npis_combinations[code][1].loc[:, subcode] # only consider those codes which cannot be combined; # for these, values of 1 have to be set to 0 subcodes_nocombi = list( diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json index a76f59d32c..cd10c7055d 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json @@ -1 +1 @@ -[{"Unnamed: 0":0,"Variablenname":"M1_1","Variable":"Beschr\u00e4nkung X","Unnamed: 3":"x","Unnamed: 4":null,"Unnamed: 5":null,"Unnamed: 6":null,"Unnamed: 7":null,"Unnamed: 8":null,"Unnamed: 9":null,"Unnamed: 10":null,"Unnamed: 11":null,"Unnamed: 12":null,"Unnamed: 13":null,"Unnamed: 14":null,"Unnamed: 15":null,"Unnamed: 16":null,"Unnamed: 17":null,"Unnamed: 18":null},{"Unnamed: 0":6,"Variablenname":"M1_2","Variable":"Beschr\u00e4nkung Y","Unnamed: 3":null,"Unnamed: 4":"x","Unnamed: 5":"x","Unnamed: 6":null,"Unnamed: 7":null,"Unnamed: 8":null,"Unnamed: 9":null,"Unnamed: 10":null,"Unnamed: 11":null,"Unnamed: 12":null,"Unnamed: 13":null,"Unnamed: 14":null,"Unnamed: 15":null,"Unnamed: 16":null,"Unnamed: 17":null,"Unnamed: 18":null},{"Unnamed: 0":12,"Variablenname":"M1_3","Variable":"Beschr\u00e4nkung Z","Unnamed: 3":null,"Unnamed: 4":"x","Unnamed: 5":"x","Unnamed: 6":null,"Unnamed: 7":null,"Unnamed: 8":null,"Unnamed: 9":null,"Unnamed: 10":null,"Unnamed: 11":null,"Unnamed: 12":null,"Unnamed: 13":null,"Unnamed: 14":null,"Unnamed: 15":null,"Unnamed: 16":null,"Unnamed: 17":null,"Unnamed: 18":null}] \ No newline at end of file +[{"Unnamed: 0":0,"Variablenname":"M1_1","Massnahmenindex":0,"Variable":"Beschr\u00e4nkung X","Unnamed: 4":"x","Unnamed: 5":null,"Unnamed: 6":null},{"Unnamed: 0":6,"Variablenname":"M1_2","Massnahmenindex":1,"Variable":"Beschr\u00e4nkung Y","Unnamed: 4":null,"Unnamed: 5":"x","Unnamed: 6":"x"},{"Unnamed: 0":12,"Variablenname":"M1_3","Massnahmenindex":2,"Variable":"Beschr\u00e4nkung Z","Unnamed: 4":null,"Unnamed: 5":"x","Unnamed: 6":"x"}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index 21ad2a8426..b6b8e9d8e8 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -291,7 +291,7 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data): # some columns should be empty # either because they're not mentioned or because the incidence is not exceeded. 
self.assertEqual(
-            npis_test.iloc[:, [4, 5, 6, 7, 9, 11, 14, 15, 17]].values.sum(), 0)
+            npis_test.iloc[:, [4, 5, 6, 7, 9, 11, 13, 15, 17, 19]].values.sum(), 0)
         # incidence independent NPIs should not have changed
         self.assertEqual(
             npis_test.M1_1.to_list(),
@@ -309,10 +309,15 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data):
         self.assertEqual(
             npis_test.M1_2_2.tolist(),
             [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
+
+        self.assertEqual(
+            npis_test.M1_3_2.tolist(),
+            [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
         #_4 -> Incidence > 50
+        # M1_3_4 is always 0 - can't be simultaneously active with M1_3_2
         self.assertEqual(
             npis_test.M1_3_4.to_list(),
-            [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1])
+            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
         self.assertEqual(
             npis_test.M1_2_4.to_list(),
             [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

From 4145f63b32747307ea17a683c7eb669758ed8d8b Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Tue, 21 Feb 2023 15:18:18 +0100
Subject: [PATCH 062/104] remove debugging artifact

---
 .../memilio-epidata/memilio/epidata/getNPIData.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index ec1c86b43e..269cc88568 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -1054,13 +1054,12 @@ def get_npi_data(fine_resolution=2,
                                 # where NPI code_cols[scidx] + level[1]
                                 # is active
                                 if df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]].any():
-                                    if subcode_excl != 'M03_020':
-                                        print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0]))
-                                        print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0])
-                                        print('Due to Incidence > ' + str(level[0]) + ' and NPI ')
-                                        print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0])
+                                    print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0]))
+                                    print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0])
+                                    print('Due to Incidence > ' + str(level[0]) + ' and NPI ')
+                                    print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0])
                                     # print(list(df_local_new.loc[indicator_code_active_idx,'Date']))
-                                        print('\n')
+                                    print('\n')
                                     df_local_new.loc[indicator_code_active_idx,
                                                      subcode_excl + level_other[1]] = 0

From 1aa1865c74a1ded2639216114d90cfa7f44886e2 Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Mon, 27 Feb 2023 15:37:17 +0100
Subject: [PATCH 063/104] rework strictness deactivation

---
 .../memilio/epidata/getNPIData.py | 153 +++++++-----------
 1 file changed, 57 insertions(+), 96 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index 269cc88568..b236170df1 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -22,6 +22,9 @@
 import os
 import pandas as pd
 import numpy as np
+import warnings
+
+warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
 
 from memilio.epidata import getDataIntoPandasDataFrame as gd
 from memilio.epidata import geoModificationGermany as geoger
@@ -590,7 +593,7 @@ def
get_npi_data(fine_resolution=2, # run through all groups and set possible combinations according to # read combination matrix - start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+2 + start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+3 for i in range(len(npi_groups_idx)): codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i], 'Variablenname'].values @@ -915,10 +918,10 @@ def get_npi_data(fine_resolution=2, # Consistency of incidence dependent NPIs: # The same NPI should not be prescribed multiple times at the same day # for different thresholds. In order to avoid contradictions, only - # retain the strictest mentioned implementation. - for i in range(int(len(df_local_old)/6)): + # retain the strictest mentioned implementation. #! wording strictness + for i in range(int(len(df_local_old)/6)): # check if only same subcode #! maybe consider main code ? sum_npi_inc = np.where( - df_local_old.iloc[6*i+1:6*(i+1), 6:].sum() > 1) + df_local_old.iloc[6*i:6*(i+1), 6:].sum() > 1) if len(sum_npi_inc[0]): print( 'Reduce multiple prescription in county ' + str(countyID) + @@ -926,14 +929,14 @@ def get_npi_data(fine_resolution=2, for j in sum_npi_inc[0]: # get lowest index (i.e., strictest implementation of NPI). idx_start = np.where( - df_local_old.iloc[6*i+1:6*(i+1), 6+j])[0].min() + df_local_old.iloc[6*i:6*(i+1), 6+j])[0].min() # Remove less strict and thus contradictory # implementations of the same NPI the same day. - df_local_old.iloc[6*i+1+idx_start+1:6*(i+1), 6+j] = 0 + df_local_old.iloc[6*i+idx_start+1:6*(i+1), 6+j] = 0 if not all( df_local_old.iloc - [6 * i + 1: 6 * (i + 1), + [6 * i : 6 * (i + 1), 6 + sum_npi_inc[0]].sum() == 1): raise gd.DataError('Consistency correction failed.') @@ -1012,84 +1015,42 @@ def get_npi_data(fine_resolution=2, df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) - # if new, dynamic NPIs for higher incidence (more restrictions, - # i.e., stricter) cannot be combined with previous, dynamic - # NPIs for lower indices (less restrictions, less strict), - # the latter have to be deactivated - # (incidence_thresholds_to_npis.keys() has to be sorted !) - levels_exclusion = list(reversed(incidence_thresholds_to_npis.keys()))[ - 0:-1] # level<0 means non-incidence dependent and always active - for level in levels_exclusion: - level_lower = [lev for lev in levels_exclusion - if lev[0] < level[0]] - for code in df_npis_combinations.keys(): - code_cols = df_npis_combinations[code][1].columns - # iterate over subcode indices - for scidx in range(len(code_cols)-1): - # check if code was used, otherwise nothing to - # exclude, i.e. no combination possible anyway. 
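The merge step introduced in this patch collapses the six incidence-threshold columns of every subcode into a single 0/1 column; a reduced sketch with one subcode and hypothetical values, keeping the two leading columns for date and county as in df_local_new:

    import pandas as pd

    # toy frame: Date, ID_County, then the six columns of one subcode M1_1
    # (M1_1, M1_1_1, ..., M1_1_5); at most one of them is 1 per day
    df_local_new = pd.DataFrame(
        [['2023-01-19', 1001, 0, 1, 0, 0, 0, 0],
         ['2023-01-20', 1001, 0, 0, 0, 0, 0, 0]],
        columns=['Date', 'ID_County',
                 'M1_1', 'M1_1_1', 'M1_1_2', 'M1_1_3', 'M1_1_4', 'M1_1_5'])
    df_merged = df_local_new.iloc[:, :2].copy()
    i = 0  # subcode index
    df_merged['M1_1'] = df_local_new.iloc[:, 2+(i*6):2+(i*6)+6].sum(axis=1)
    # a value > 1 would indicate an inconsistent multiple prescription
    assert df_merged['M1_1'].max() <= 1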
- indicator_code_active = df_local_new.loc[:, - code_cols - [scidx] + - level - [1]] - indicator_code_active_idx = np.where( - indicator_code_active > 0)[0] - if len(indicator_code_active_idx) > 0: - # extract codes - subcodes_nocombi = df_npis_combinations[code][1].loc[scidx, :] - # only consider those codes which cannot be - # combined; for these, values of 1 have to be - # set to 0 - subcodes_nocombi = list( - subcodes_nocombi - [subcodes_nocombi == 0].index) - # iterate over exclusive subcodes - for subcode_excl in subcodes_nocombi: - # iterate over less strict dynamic NPIs - # i.e., where threshold is higher - for level_other in level_lower: - # deactivate potential NPIs (with code: - # subcode_excl + level_other[1]) on days - # where NPI code_cols[scidx] + level[1] - # is active - if df_local_new.loc[indicator_code_active_idx, subcode_excl + level_other[1]].any(): - print('Deactivating for ' + 'County ' + str(countyID) + ' and Incidence ' + str(level_other[0])) - print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==subcode_excl + level_other[1]].index].values[0]) - print('Due to Incidence > ' + str(level[0]) + ' and NPI ') - print('\t' + npi_codes_prior_desc[npi_codes_prior[npi_codes_prior==code_cols[scidx] + level[1]].index].values[0]) - # print(list(df_local_new.loc[indicator_code_active_idx,'Date'])) - print('\n') - df_local_new.loc[indicator_code_active_idx, - subcode_excl + level_other[1]] = 0 - - # TODO: (maybe also remove incidence-dependent here? to discuss!) - # Remove conflicting non-incidence (?) dependent NPIs according to - # strictness index of Corona-Datenplattform and exclusion - # criteria defined in df_npis_combinations + # merge incidence dependent NPIs to have only one column for each subcode + df_merged=df_local_new.iloc[:,:2].copy() for code in df_npis_combinations.keys(): - # get other subcodes MX_Y_* below current main code MX_Y - subcodes_list = list(df_npis_combinations[code][0].keys()) + for i in range(len(df_npis_combinations[code][1].columns)): + df_merged[df_npis_combinations[code][1].columns[i] + ] = df_local_new.iloc[:, 2+(i*6):2+(i*6)+6].sum(axis=1) + # strictness deactivation is done with this merged dataframe + + if df_merged.max()[2:].max() > 1: + raise gd.DataError('Error in merging...') + + # Remove conflicting NPIs according to strictness index of Corona-Datenplattform and exclusion criteria + # defined in df_npis_combinations + for maincode in df_npis_combinations.keys(): + # get all subcodes + subcodes = list(df_npis_combinations[maincode][0].keys()) # sort index reversely with the strictest (highest) index first idx_strictness_sorted_rev = np.argsort( - list(df_npis_combinations[code][0].values()))[::-1] - - for jj in range(len(idx_strictness_sorted_rev)-1): + list(df_npis_combinations[maincode][0].values()))[::-1] + for i in range(len(idx_strictness_sorted_rev)-1): # get index of NPI of a certain strictness - idx_strictness = idx_strictness_sorted_rev[jj] + idx_strictness = idx_strictness_sorted_rev[i] # get code of corresponding NPI - subcode = subcodes_list[idx_strictness] + subcode = subcodes[idx_strictness] + + # get subcode index + scidx = subcodes.index(subcode) # get indices of days where subcode is active - subcode_active = np.where(df_local_new.loc[:,subcode]>0)[0] + subcode_active = np.where(df_merged.loc[:,subcode]>0)[0] - if len(subcode_active>0): + if len(subcode_active) > 0: # get indices of less strict NPIs - idxs_less_strict = np.sort(idx_strictness_sorted_rev[jj+1:]) + idxs_less_strict = 
df_npis_combinations[maincode][1].columns[np.sort(idx_strictness_sorted_rev[i+1:])] - # extract true/false list of combination of subcodes - # TODO [0] or [1] ? - subcodes_nocombi = df_npis_combinations[code][1].loc[:, subcode] + subcodes_nocombi = df_npis_combinations[maincode][1].loc[scidx, :] # only consider those codes which cannot be combined; # for these, values of 1 have to be set to 0 subcodes_nocombi = list( @@ -1097,32 +1058,31 @@ def get_npi_data(fine_resolution=2, [subcodes_nocombi == 0].index) # intersect non-combinable subcodes with less strict subcodes - idx_subcodes_deactivation = np.sort(list( - set(idxs_less_strict).intersection(subcodes_nocombi))) + subcodes_deactivation = list(set(idxs_less_strict).intersection(subcodes_nocombi)) + + # check if all codes which can not be combined get deactivated + for code in subcodes_nocombi: + if code not in subcodes_deactivation: + print('WARNING!') + #raise gd.DataError('Can't deactivate NPI code ' + code + ' in county ' + countyID) - for kk in idx_subcodes_deactivation: - days_deact = np.where(df_local_new.loc[subcode_active, [subcodes_list[kk] + str(appendix[1]) for appendix in incidence_thresholds]].sum(axis=1)>0)[0] + for nocombi_code in subcodes_deactivation: + days_deact = np.where(df_merged.loc[subcode_active, nocombi_code]>0)[0] if len(days_deact) > 0: print('Deactivating for ' + 'County ' + str(countyID)) - print('\t' + str(subcodes_list[kk]) + ' due to ' + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') + print('\t' + str(nocombi_code) + ' due to ' + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') print('\n') - df_local_new.loc[subcode_active, - [subcodes_list[kk] + - str(appendix[1]) - for appendix in - incidence_thresholds]] = 0 - - # reduction of factor space NPI x incidence threshold to NPI - # by max aggregation of all incidence threshold columns per NPI + df_merged.loc[subcode_active, nocombi_code] = 0 + + # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: - for main_code, codes_group in maincode_to_npicodes_map.items(): - # group by incidence (former codes X1_Y, X1_Z were transformed - # to X1, X2) and write max value to main code column - df_local_new.loc[:, main_code] = df_local_new.loc[:, codes_group].max( - axis=1) - # remove subcategory columns - df_local_new = df_local_new.loc[:, [ - dd.EngEng['date'], dd.EngEng['idCounty']] + npi_codes_aggregated].copy() + df_local_new = df_merged.copy() + else: + # multiply subcode columns with incidence dependent subcode columns in df_local_new + for maincode in df_npis_combinations.keys: + for subcode in df_npis_combinations[maincode][1].columns: + for incidcode in ['','_1','_2','_3','_4','_5']: + df_local_new[subcode+incidcode]*=df_merged[subcode] counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1148,6 +1108,7 @@ def get_npi_data(fine_resolution=2, str(len(counties_considered)) + '. Estimated time remaining: ' + str(int(time_remain / 60)) + ' min.') + if counter_cases_start >= len(counties_considered)*0.05: print('WARNING: DataFrame starts with reported cases > 0 ' 'for more than 5 percent of the counties to be considered. 
' From f5d6ffe898dab56c2172170bee7b9051cc711ffb Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 28 Feb 2023 13:49:03 +0100 Subject: [PATCH 064/104] remove warning --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index b236170df1..b3e6e058a9 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1060,12 +1060,6 @@ def get_npi_data(fine_resolution=2, # intersect non-combinable subcodes with less strict subcodes subcodes_deactivation = list(set(idxs_less_strict).intersection(subcodes_nocombi)) - # check if all codes which can not be combined get deactivated - for code in subcodes_nocombi: - if code not in subcodes_deactivation: - print('WARNING!') - #raise gd.DataError('Can't deactivate NPI code ' + code + ' in county ' + countyID) - for nocombi_code in subcodes_deactivation: days_deact = np.where(df_merged.loc[subcode_active, nocombi_code]>0)[0] if len(days_deact) > 0: @@ -1079,7 +1073,7 @@ def get_npi_data(fine_resolution=2, df_local_new = df_merged.copy() else: # multiply subcode columns with incidence dependent subcode columns in df_local_new - for maincode in df_npis_combinations.keys: + for maincode in df_npis_combinations.keys(): for subcode in df_npis_combinations[maincode][1].columns: for incidcode in ['','_1','_2','_3','_4','_5']: df_local_new[subcode+incidcode]*=df_merged[subcode] From 62fc0fb1880108e8e89448ef4619fb8a08114b5b Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 7 Mar 2023 10:49:34 +0100 Subject: [PATCH 065/104] adapt test to new deactivation method --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 2 +- .../epidata_test/test_data/TestSetNPIsCombinations.json | 2 +- .../memilio/epidata_test/test_epidata_getNPIData.py | 7 ++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index b3e6e058a9..0ad12c69ce 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -593,7 +593,7 @@ def get_npi_data(fine_resolution=2, # run through all groups and set possible combinations according to # read combination matrix - start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+3 + start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+2 for i in range(len(npi_groups_idx)): codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i], 'Variablenname'].values diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json index cd10c7055d..aab4ec18d7 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json +++ b/pycode/memilio-epidata/memilio/epidata_test/test_data/TestSetNPIsCombinations.json @@ -1 +1 @@ -[{"Unnamed: 0":0,"Variablenname":"M1_1","Massnahmenindex":0,"Variable":"Beschr\u00e4nkung X","Unnamed: 4":"x","Unnamed: 5":null,"Unnamed: 6":null},{"Unnamed: 0":6,"Variablenname":"M1_2","Massnahmenindex":1,"Variable":"Beschr\u00e4nkung Y","Unnamed: 4":null,"Unnamed: 5":"x","Unnamed: 6":"x"},{"Unnamed: 0":12,"Variablenname":"M1_3","Massnahmenindex":2,"Variable":"Beschr\u00e4nkung Z","Unnamed: 4":null,"Unnamed: 
5":"x","Unnamed: 6":"x"}] \ No newline at end of file +[{"Unnamed: 0":0,"Variablenname":"M1_1","Massnahmenindex":0,"Variable":"Beschr\u00e4nkung X","empty":0,"Unnamed: 5":"x","Unnamed: 6":null,"Unnamed: 7":null},{"Unnamed: 0":6,"Variablenname":"M1_2","Massnahmenindex":1,"Variable":"Beschr\u00e4nkung Y","empty":0,"Unnamed: 5":null,"Unnamed: 6":"x","Unnamed: 7":"x"},{"Unnamed: 0":12,"Variablenname":"M1_3","Massnahmenindex":2,"Variable":"Beschr\u00e4nkung Z","empty":0,"Unnamed: 5":null,"Unnamed: 6":"x","Unnamed: 7":"x"}] \ No newline at end of file diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index b6b8e9d8e8..a3dc77f84e 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -295,7 +295,7 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data): # incidence independent NPIs should not have changed self.assertEqual( npis_test.M1_1.to_list(), - [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]) + [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]) self.assertEqual( npis_test.M1_2.to_list(), [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) @@ -306,9 +306,10 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data): # incidence depending NPIs can first be activated on day 4 due to activation_days_threshold=3 # incidence is constantly > 20 # M1_2_1,M1_3_1,M1_1_2,M1_3_2 always 0 + #M1_2_2 is 0 if M1_2 is 1 self.assertEqual( npis_test.M1_2_2.tolist(), - [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]) + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) self.assertEqual( npis_test.M1_3_2.tolist(), @@ -326,7 +327,7 @@ def test_get_npi_data(self, mock_codes, mock_read, mock_data): #M1_1_1 should not be active when M2,3_2,3,4,5 is active self.assertEqual( npis_test.M1_1_1.to_list(), - [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]) + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) if __name__ == '__main__': From d8dc328f72bc4f69a19772fcc01b696abb53c870 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 27 Mar 2023 14:18:04 +0200 Subject: [PATCH 066/104] Count joined codes --- .../memilio/epidata/getNPIData.py | 57 ++++++++++++++++++- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 0ad12c69ce..1422327296 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -23,6 +23,8 @@ import pandas as pd import numpy as np import warnings +import matplotlib as mpl +import matplotlib.pyplot as plt warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) @@ -608,9 +610,7 @@ def get_npi_data(fine_resolution=2, df_npis_combinations[npic_uniq][1], columns=codes_local) df_npis_combinations[npic_uniq][1].insert( - 0, 'Code', codes_local) - - del df_npis_combinations_pre + 0, 'Code', codes_local) # use to_excel function and specify the sheet_name and index # to store the dataframe in specified sheet if file not yet existent @@ -874,6 +874,17 @@ def get_npi_data(fine_resolution=2, df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True) counter_cases_start = 0 + df_npis_multiple_codes = { + npi_groups_combinations_unique[i]: + [ + {df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]].values[j]: + df_npis_combinations_pre['Massnahmenindex'][npi_groups_idx[i]].values[j] for j in range( + 
len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]]))}, + np.zeros((len(npi_groups_idx[i]), len(npi_groups_idx[i])))] + for i in range(len(npi_groups_combinations_unique))} + + count_codes(df_npis_old, df_npis_multiple_codes) + for countyID in counties_considered: cid = 0 countyidx += 1 @@ -1156,6 +1167,46 @@ def get_npi_data(fine_resolution=2, return df_npis +def count_codes(df_npis_old, df_npis_combinations): + directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') + for county in geoger.get_county_ids(): + df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']]==county] + code_dict = {} + for code in df_npis_combinations.keys(): + for column in df_npis_combinations[code][1]: + code_dict[column] = df_local.iloc[:,6+np.where(df_local[df_local.NPI_code.str.contains(column)].iloc[:,6:].max() > 0)[0]].columns + + for code in df_npis_combinations.keys(): + column_list = df_npis_combinations[code][1].columns + for column in range(len(column_list)): + for column_other in range(len(column_list)): + df_npis_combinations[code][1].iloc[column, column_other] += len(set(code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + df_npis_combinations[code][1].iloc[column_other, column] += len(set(code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) + + writer = pd.ExcelWriter(os.path.join(directory,'joined_codes.xlsx'), engine='xlsxwriter') + for code in df_npis_combinations.keys(): + df_npis_combinations[code][1].to_excel(writer, sheet_name=code) + writer.close() + + colors1 = np.array([[1.,1.,1.,1.]]) + colors2 = mpl.cm.get_cmap('cool')(np.linspace(0,1,255)) + colors = np.vstack((colors1, colors2)) + cmap = mpl.colors.LinearSegmentedColormap.from_list('colormap', colors) + + for code in df_npis_combinations.keys(): + df = pd.read_excel(os.path.join(directory, 'joined_codes.xlsx'), sheet_name = code, engine='openpyxl') + array_exclusion = df.iloc[:,1:].to_numpy() + fig = plt.figure() + positions = [i for i in range(len(df.columns)-1)] + plt.xticks(positions, [colname[-3:] for colname in df.columns.to_list()[1:]]) + plt.yticks(positions, df.columns.to_list()[1:]) + plt.imshow(array_exclusion, cmap=cmap, vmin = 0) + plt.colorbar() + plt.savefig(os.path.join(directory, 'heatmap_joined_codes', 'joined_codes_{}'.format(code))) + plt.close() + + + def main(): """! 
Main program entry.""" From 797f147066abf25e8587ac71471441690b36d1e5 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Thu, 30 Mar 2023 11:05:36 +0200 Subject: [PATCH 067/104] review changes --- .../memilio/epidata/getNPIData.py | 99 +++++++++++-------- 1 file changed, 57 insertions(+), 42 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 1422327296..b46dbcab91 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -206,7 +206,6 @@ def read_files(directory, fine_resolution): 'kr_massnahmen_unterkategorien.csv', 'https://www.corona-datenplattform.de/dataset/massnahmen_unterkategorien_kreise') raise FileNotFoundError - # check if rows hospitals and geriatric care are still empty; # these fields have been empty so far and are thus not used test_codes = ['M23_010', 'M23_020', 'M23_030', 'M23_040', @@ -495,6 +494,10 @@ def get_npi_data(fine_resolution=2, except ValueError: pass + # if only one county is considered, it should be a list with one element + if not isinstance(counties_considered, list): + counties_considered = [counties_considered] + directory = out_folder directory = os.path.join(directory, 'Germany/') gd.check_dir(directory) @@ -874,16 +877,19 @@ def get_npi_data(fine_resolution=2, df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True) counter_cases_start = 0 - df_npis_multiple_codes = { - npi_groups_combinations_unique[i]: - [ - {df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]].values[j]: - df_npis_combinations_pre['Massnahmenindex'][npi_groups_idx[i]].values[j] for j in range( - len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]]))}, - np.zeros((len(npi_groups_idx[i]), len(npi_groups_idx[i])))] - for i in range(len(npi_groups_combinations_unique))} + # setup dataframe for each maingroup, same format as df_npi_combinations + df_count = df_npis_combinations.copy() + for code in df_count.keys(): + df_count[code][1] *= 0 + + # count_codes(df_npis_old, df_count.copy()) - count_codes(df_npis_old, df_npis_multiple_codes) + all_subcodes = [] + for maincode in df_npis_combinations.keys(): + all_subcodes += df_npis_combinations[maincode][1].columns.to_list() + if df_npis_combinations[maincode][1].columns.to_list() != list( + df_npis_combinations[maincode][0].keys()): + raise gd.DataError('Error') for countyID in counties_considered: cid = 0 @@ -929,8 +935,8 @@ def get_npi_data(fine_resolution=2, # Consistency of incidence dependent NPIs: # The same NPI should not be prescribed multiple times at the same day # for different thresholds. In order to avoid contradictions, only - # retain the strictest mentioned implementation. #! wording strictness - for i in range(int(len(df_local_old)/6)): # check if only same subcode #! maybe consider main code ? + # retain the strictest mentioned implementation. 
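# A minimal standalone sketch (not part of the patch) of the correction
# the loop below performs, on made-up data: one subcode block of six
# rows (the incidence-independent code plus five incidence thresholds),
# with days as columns. Per day, only the first active row, i.e. the
# strictest prescription, is retained; all later rows are zeroed.
import numpy as np

block = np.array([[0, 1, 0],
                  [1, 1, 0],
                  [0, 0, 0],
                  [0, 0, 0],
                  [0, 0, 0],
                  [0, 0, 1]])
for day in np.where(block.sum(axis=0) > 1)[0]:
    idx_start = np.where(block[:, day])[0].min()  # strictest active row
    block[idx_start + 1:, day] = 0                # drop contradictions
assert (block.sum(axis=0) <= 1).all()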
+ for i in range(int(len(df_local_old)/6)): sum_npi_inc = np.where( df_local_old.iloc[6*i:6*(i+1), 6:].sum() > 1) if len(sum_npi_inc[0]): @@ -1028,10 +1034,8 @@ def get_npi_data(fine_resolution=2, # merge incidence dependent NPIs to have only one column for each subcode df_merged=df_local_new.iloc[:,:2].copy() - for code in df_npis_combinations.keys(): - for i in range(len(df_npis_combinations[code][1].columns)): - df_merged[df_npis_combinations[code][1].columns[i] - ] = df_local_new.iloc[:, 2+(i*6):2+(i*6)+6].sum(axis=1) + for subcode in all_subcodes: + df_merged[subcode] = df_local_new.filter(regex=subcode).sum(axis = 1) # strictness deactivation is done with this merged dataframe if df_merged.max()[2:].max() > 1: @@ -1051,17 +1055,14 @@ def get_npi_data(fine_resolution=2, # get code of corresponding NPI subcode = subcodes[idx_strictness] - # get subcode index - scidx = subcodes.index(subcode) - # get indices of days where subcode is active subcode_active = np.where(df_merged.loc[:,subcode]>0)[0] if len(subcode_active) > 0: # get indices of less strict NPIs - idxs_less_strict = df_npis_combinations[maincode][1].columns[np.sort(idx_strictness_sorted_rev[i+1:])] + codes_less_strict = df_npis_combinations[maincode][1].columns[np.sort(idx_strictness_sorted_rev[i+1:])] - subcodes_nocombi = df_npis_combinations[maincode][1].loc[scidx, :] + subcodes_nocombi = df_npis_combinations[maincode][1].loc[idx_strictness, :] # only consider those codes which cannot be combined; # for these, values of 1 have to be set to 0 subcodes_nocombi = list( @@ -1069,7 +1070,7 @@ def get_npi_data(fine_resolution=2, [subcodes_nocombi == 0].index) # intersect non-combinable subcodes with less strict subcodes - subcodes_deactivation = list(set(idxs_less_strict).intersection(subcodes_nocombi)) + subcodes_deactivation = np.sort(list(set(codes_less_strict).intersection(subcodes_nocombi))) for nocombi_code in subcodes_deactivation: days_deact = np.where(df_merged.loc[subcode_active, nocombi_code]>0)[0] @@ -1078,16 +1079,17 @@ def get_npi_data(fine_resolution=2, print('\t' + str(nocombi_code) + ' due to ' + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') print('\n') df_merged.loc[subcode_active, nocombi_code] = 0 + df_count[maincode][1].loc[idx_strictness, + nocombi_code] += len(days_deact) # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: df_local_new = df_merged.copy() else: # multiply subcode columns with incidence dependent subcode columns in df_local_new - for maincode in df_npis_combinations.keys(): - for subcode in df_npis_combinations[maincode][1].columns: - for incidcode in ['','_1','_2','_3','_4','_5']: - df_local_new[subcode+incidcode]*=df_merged[subcode] + for subcode in all_subcodes: + for incidcode in ['','_1','_2','_3','_4','_5']: + df_local_new[subcode+incidcode]*=df_merged[subcode] counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1164,45 +1166,58 @@ def get_npi_data(fine_resolution=2, else: filename = 'germany_counties_npi_maincat' gd.write_dataframe(df_npis, directory, filename, file_format) + gd.write_dataframe(df_count, directory, 'Exlusions', file_format) return df_npis -def count_codes(df_npis_old, df_npis_combinations): + +def count_codes(df_npis_old, df_npis_combinations, counties_considered): directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') - for county in geoger.get_county_ids(): - df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']]==county] + for county in counties_considered: + df_local = 
df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county] code_dict = {} for code in df_npis_combinations.keys(): - for column in df_npis_combinations[code][1]: - code_dict[column] = df_local.iloc[:,6+np.where(df_local[df_local.NPI_code.str.contains(column)].iloc[:,6:].max() > 0)[0]].columns + for column in df_npis_combinations[code][1].columns: + code_dict[column] = df_local.iloc[:, 6+np.where( + df_local[df_local.NPI_code.str.contains(column)].iloc[:, 6:].max() > 0)[0]].columns for code in df_npis_combinations.keys(): column_list = df_npis_combinations[code][1].columns for column in range(len(column_list)): for column_other in range(len(column_list)): - df_npis_combinations[code][1].iloc[column, column_other] += len(set(code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - df_npis_combinations[code][1].iloc[column_other, column] += len(set(code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) - - writer = pd.ExcelWriter(os.path.join(directory,'joined_codes.xlsx'), engine='xlsxwriter') + df_npis_combinations[code][1].iloc[column, column_other] += len(set( + code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + df_npis_combinations[code][1].iloc[column_other, column] += len(set( + code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) + + writer = pd.ExcelWriter( + os.path.join(directory, 'joined_codes.xlsx'), + engine='xlsxwriter') for code in df_npis_combinations.keys(): df_npis_combinations[code][1].to_excel(writer, sheet_name=code) writer.close() - colors1 = np.array([[1.,1.,1.,1.]]) - colors2 = mpl.cm.get_cmap('cool')(np.linspace(0,1,255)) + colors1 = np.array([[1., 1., 1., 1.]]) + colors2 = mpl.cm.get_cmap('cool')(np.linspace(0, 1, 255)) colors = np.vstack((colors1, colors2)) cmap = mpl.colors.LinearSegmentedColormap.from_list('colormap', colors) for code in df_npis_combinations.keys(): - df = pd.read_excel(os.path.join(directory, 'joined_codes.xlsx'), sheet_name = code, engine='openpyxl') - array_exclusion = df.iloc[:,1:].to_numpy() + df = pd.read_excel( + os.path.join(directory, 'joined_codes.xlsx'), + sheet_name=code, engine='openpyxl') + array_exclusion = df.iloc[:, 1:].to_numpy() fig = plt.figure() positions = [i for i in range(len(df.columns)-1)] - plt.xticks(positions, [colname[-3:] for colname in df.columns.to_list()[1:]]) + plt.xticks(positions, [colname[-3:] + for colname in df.columns.to_list()[1:]]) plt.yticks(positions, df.columns.to_list()[1:]) - plt.imshow(array_exclusion, cmap=cmap, vmin = 0) + plt.imshow(array_exclusion, cmap=cmap, vmin=0) plt.colorbar() - plt.savefig(os.path.join(directory, 'heatmap_joined_codes', 'joined_codes_{}'.format(code))) + plt.savefig( + os.path.join( + directory, 'heatmap_joined_codes', 'joined_codes_{}'.format( + code))) plt.close() From 1447c363e433001f0bf9a89920b44c6f52dc9312 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 4 Apr 2023 12:06:36 +0200 Subject: [PATCH 068/104] count multiple codes incidence dependent, fix combination matrix --- .../memilio/epidata/getNPIData.py | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index b46dbcab91..9c0bc94a9d 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -20,6 +20,7 @@ from datetime import datetime, timedelta import time 
import os +import copy import pandas as pd import numpy as np import warnings @@ -598,7 +599,9 @@ def get_npi_data(fine_resolution=2, # run through all groups and set possible combinations according to # read combination matrix - start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+2 + # find begin of combination matrix + # there may be multiple columns named '0', so find first '1' column + start_comb_matrix = list(df_npis_combinations_pre.columns).index(1)-1 for i in range(len(npi_groups_idx)): codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i], 'Variablenname'].values @@ -878,11 +881,14 @@ def get_npi_data(fine_resolution=2, counter_cases_start = 0 # setup dataframe for each maingroup, same format as df_npi_combinations - df_count = df_npis_combinations.copy() + df_count = copy.deepcopy(df_npis_combinations) for code in df_count.keys(): df_count[code][1] *= 0 - # count_codes(df_npis_old, df_count.copy()) + # create dataframe to count multiple codes after incidence dependent (de-)activation + df_count_incid_depend = pd.DataFrame() + + #count_codes(df_npis_old, df_count.copy(), counties_considered) all_subcodes = [] for maincode in df_npis_combinations.keys(): @@ -1031,6 +1037,8 @@ def get_npi_data(fine_resolution=2, # with the respective value in int_active df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) + + df_count_incid_depend = pd.concat([df_count_incid_depend, df_local_new.copy()]) # merge incidence dependent NPIs to have only one column for each subcode df_merged=df_local_new.iloc[:,:2].copy() @@ -1075,7 +1083,7 @@ def get_npi_data(fine_resolution=2, for nocombi_code in subcodes_deactivation: days_deact = np.where(df_merged.loc[subcode_active, nocombi_code]>0)[0] if len(days_deact) > 0: - print('Deactivating for ' + 'County ' + str(countyID)) + print('Deactivating for County ' + str(countyID)) print('\t' + str(nocombi_code) + ' due to ' + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') print('\n') df_merged.loc[subcode_active, nocombi_code] = 0 @@ -1166,7 +1174,7 @@ def get_npi_data(fine_resolution=2, else: filename = 'germany_counties_npi_maincat' gd.write_dataframe(df_npis, directory, filename, file_format) - gd.write_dataframe(df_count, directory, 'Exlusions', file_format) + gd.write_dataframe(df_count_incid_depend, directory, 'joined_codes_incid_dependent', file_format) return df_npis @@ -1181,10 +1189,20 @@ def count_codes(df_npis_old, df_npis_combinations, counties_considered): code_dict[column] = df_local.iloc[:, 6+np.where( df_local[df_local.NPI_code.str.contains(column)].iloc[:, 6:].max() > 0)[0]].columns + #for code in df_npis_combinations.keys(): + # column_list = df_npis_combinations[code][1].columns + # for column in range(len(column_list)): + # for column_other in range(len(column_list)): + # df_npis_combinations[code][1].iloc[column, column_other] += len(set( + # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + # df_npis_combinations[code][1].iloc[column_other, column] += len(set( + # code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) + + # no diag for code in df_npis_combinations.keys(): column_list = df_npis_combinations[code][1].columns for column in range(len(column_list)): - for column_other in range(len(column_list)): + for column_other in range(column): df_npis_combinations[code][1].iloc[column, column_other] += len(set( 
code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) df_npis_combinations[code][1].iloc[column_other, column] += len(set( @@ -1198,7 +1216,7 @@ def count_codes(df_npis_old, df_npis_combinations, counties_considered): writer.close() colors1 = np.array([[1., 1., 1., 1.]]) - colors2 = mpl.cm.get_cmap('cool')(np.linspace(0, 1, 255)) + colors2 = mpl.cm.get_cmap('cool')(np.linspace(0, 1, 100000)) colors = np.vstack((colors1, colors2)) cmap = mpl.colors.LinearSegmentedColormap.from_list('colormap', colors) @@ -1216,7 +1234,7 @@ def count_codes(df_npis_old, df_npis_combinations, counties_considered): plt.colorbar() plt.savefig( os.path.join( - directory, 'heatmap_joined_codes', 'joined_codes_{}'.format( + directory, 'heatmaps_joined_codes', 'joined_codes_{}'.format( code))) plt.close() From 2ac18889873df55b7449c79a71d58cfe0178296f Mon Sep 17 00:00:00 2001 From: Anna Wendler Date: Tue, 4 Apr 2023 13:32:53 +0200 Subject: [PATCH 069/104] Adjust plotting --- .../memilio/epidata/getNPIData.py | 146 +++++++++++------- 1 file changed, 88 insertions(+), 58 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index b46dbcab91..f1c639ae1c 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -17,9 +17,14 @@ # See the License for the specific language governing permissions and # limitations under the License. ############################################################################# +from memilio.epidata import getPopulationData as gpd +from memilio.epidata import defaultDict as dd +from memilio.epidata import geoModificationGermany as geoger +from memilio.epidata import getDataIntoPandasDataFrame as gd from datetime import datetime, timedelta import time import os +import copy import pandas as pd import numpy as np import warnings @@ -28,11 +33,6 @@ warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) -from memilio.epidata import getDataIntoPandasDataFrame as gd -from memilio.epidata import geoModificationGermany as geoger -from memilio.epidata import defaultDict as dd -from memilio.epidata import getPopulationData as gpd - def validate(df_npis_old, df_npis, df_infec_rki, countyID, npiCode, start_npi_cols, npi_incid_start, start_date_validation, @@ -532,16 +532,16 @@ def get_npi_data(fine_resolution=2, # NPIs of different main categories (e.g., M01a and M04) can always be # prescribed together as they target different locations and settings. # - # NPIs with the same main code (i.e., targeting the same location, e.g., + # NPIs with the same main code (i.e., targeting the same location, e.g., # schools, or the same set of NPIs, e.g., masks) can exclude each other. # Exclusion happens based on table provided in xlsx or csv format. - # + # # In first place, NPIs of higher stringency index as defined by the Corona- - # Datenplattform deactivate NPIs with lower stringency index. + # Datenplattform deactivate NPIs with lower stringency index. # NPIs of the same main code and with the same stringency index may or # may not exclude each other according to the threshold they were # prescribed with. Prescribed and active NPIs with high incidence thresholds - # deactivate conflicting NPIs with lower thresholds (as the latter are + # deactivate conflicting NPIs with lower thresholds (as the latter are # considered to be less strict). 
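# A minimal standalone sketch of the strictness rule described above,
# for a made-up main group of three subcodes; the strictness indices
# and the exclusion matrix (0 = cannot be combined) are hypothetical,
# not taken from the Corona-Datenplattform tables.
import numpy as np
import pandas as pd

strictness = {'M1_1': 0, 'M1_2': 1, 'M1_3': 2}   # higher = stricter
combi = pd.DataFrame([[1, 0, 1],
                      [0, 1, 0],
                      [1, 0, 1]], columns=list(strictness))
days = pd.DataFrame({'M1_1': [1, 1], 'M1_2': [1, 0], 'M1_3': [0, 1]})

subcodes = list(strictness)
order = np.argsort(list(strictness.values()))[::-1]  # strictest first
for pos, idx in enumerate(order[:-1]):
    active = np.where(days[subcodes[idx]] > 0)[0]
    if len(active) > 0:
        less_strict = [subcodes[j] for j in order[pos + 1:]]
        nocombi = list(combi.columns[combi.iloc[idx] == 0])
        for code in sorted(set(less_strict) & set(nocombi)):
            days.loc[active, code] = 0  # stricter NPI wins on these days
print(days)  # M1_2 deactivates M1_1 on day 0; M1_3 and M1_1 can coexist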
if fine_resolution > 0: num_nonexistent_codes_pre = df_npis_combinations_pre['Variablenname'].str.count( @@ -549,15 +549,17 @@ def get_npi_data(fine_resolution=2, if num_nonexistent_codes_pre != 0: df_npis_combinations_pre = df_npis_combinations_pre.iloc[: - num_nonexistent_codes_pre, :] - + # drop 0 column if existent try: df_npis_combinations_pre.drop(columns=0) except KeyError: pass # rename essential columns and throw away others - columns_combinations = np.where((df_npis_combinations_pre=='x').any()==True)[0] #maybe rename columns_used ? - column_names = ['Unnamed: ' + str(i) for i in range(columns_combinations[0], columns_combinations[-1]+1)] + columns_combinations = np.where((df_npis_combinations_pre == 'x').any() == True)[ + 0] # maybe rename columns_used ? + column_names = [ + 'Unnamed: ' + str(i) for i in range(columns_combinations[0], columns_combinations[-1]+1)] rename_columns = {column_names[i]: i for i in range(len(column_names))} df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) df_npis_combinations_pre = df_npis_combinations_pre[[ @@ -589,16 +591,16 @@ def get_npi_data(fine_resolution=2, df_npis_combinations = { npi_groups_combinations_unique[i]: [ - {df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]].values[j]: - df_npis_combinations_pre['Massnahmenindex'][npi_groups_idx[i]].values[j] for j in range( - len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]]))}, + {df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]].values[j]: + df_npis_combinations_pre['Massnahmenindex'][npi_groups_idx[i]].values[j] for j in range( + len(df_npis_combinations_pre['Variablenname'][npi_groups_idx[i]]))}, np.eye(len(npi_groups_idx[i]))] for i in range(len(npi_groups_combinations_unique))} - # run through all groups and set possible combinations according to # read combination matrix - start_comb_matrix = list(df_npis_combinations_pre.columns).index('Variablenname')+2 + start_comb_matrix = list( + df_npis_combinations_pre.columns).index('Variablenname')+2 for i in range(len(npi_groups_idx)): codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i], 'Variablenname'].values @@ -613,7 +615,7 @@ def get_npi_data(fine_resolution=2, df_npis_combinations[npic_uniq][1], columns=codes_local) df_npis_combinations[npic_uniq][1].insert( - 0, 'Code', codes_local) + 0, 'Code', codes_local) # use to_excel function and specify the sheet_name and index # to store the dataframe in specified sheet if file not yet existent @@ -714,9 +716,7 @@ def get_npi_data(fine_resolution=2, if key in npis['NPI_code'].values} # remove columns of combinations df_npis_combinations[code][1] = df_npis_combinations[code][1].loc[local_codes_used_rows, - local_codes_used_cols].reset_index(drop=True).copy() - - + local_codes_used_cols].reset_index(drop=True).copy() # prepare grouping of NPIs to reduce product space of # NPI x active_from_inc (with values "incidence does not matter", and @@ -882,7 +882,9 @@ def get_npi_data(fine_resolution=2, for code in df_count.keys(): df_count[code][1] *= 0 - # count_codes(df_npis_old, df_count.copy()) + count_codes(df_npis_old, df_count.copy(), + counties_considered=counties_considered) + plot_counter('joined_codes') all_subcodes = [] for maincode in df_npis_combinations.keys(): @@ -953,7 +955,7 @@ def get_npi_data(fine_resolution=2, if not all( df_local_old.iloc - [6 * i : 6 * (i + 1), + [6 * i: 6 * (i + 1), 6 + sum_npi_inc[0]].sum() == 1): raise gd.DataError('Consistency correction failed.') @@ -1033,9 +1035,10 @@ def 
get_npi_data(fine_resolution=2, = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) # merge incidence dependent NPIs to have only one column for each subcode - df_merged=df_local_new.iloc[:,:2].copy() + df_merged = df_local_new.iloc[:, :2].copy() for subcode in all_subcodes: - df_merged[subcode] = df_local_new.filter(regex=subcode).sum(axis = 1) + df_merged[subcode] = df_local_new.filter( + regex=subcode).sum(axis=1) # strictness deactivation is done with this merged dataframe if df_merged.max()[2:].max() > 1: @@ -1056,27 +1059,32 @@ def get_npi_data(fine_resolution=2, subcode = subcodes[idx_strictness] # get indices of days where subcode is active - subcode_active = np.where(df_merged.loc[:,subcode]>0)[0] + subcode_active = np.where(df_merged.loc[:, subcode] > 0)[0] - if len(subcode_active) > 0: + if len(subcode_active) > 0: # get indices of less strict NPIs - codes_less_strict = df_npis_combinations[maincode][1].columns[np.sort(idx_strictness_sorted_rev[i+1:])] + codes_less_strict = df_npis_combinations[maincode][1].columns[np.sort( + idx_strictness_sorted_rev[i+1:])] subcodes_nocombi = df_npis_combinations[maincode][1].loc[idx_strictness, :] - # only consider those codes which cannot be combined; + # only consider those codes which cannot be combined; # for these, values of 1 have to be set to 0 subcodes_nocombi = list( subcodes_nocombi [subcodes_nocombi == 0].index) - + # intersect non-combinable subcodes with less strict subcodes - subcodes_deactivation = np.sort(list(set(codes_less_strict).intersection(subcodes_nocombi))) + subcodes_deactivation = np.sort( + list(set(codes_less_strict).intersection(subcodes_nocombi))) for nocombi_code in subcodes_deactivation: - days_deact = np.where(df_merged.loc[subcode_active, nocombi_code]>0)[0] + days_deact = np.where( + df_merged.loc[subcode_active, nocombi_code] > 0)[0] if len(days_deact) > 0: - print('Deactivating for ' + 'County ' + str(countyID)) - print('\t' + str(nocombi_code) + ' due to ' + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') + print('Deactivating for ' + + 'County ' + str(countyID)) + print('\t' + str(nocombi_code) + ' due to ' + + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') print('\n') df_merged.loc[subcode_active, nocombi_code] = 0 df_count[maincode][1].loc[idx_strictness, @@ -1088,8 +1096,8 @@ def get_npi_data(fine_resolution=2, else: # multiply subcode columns with incidence dependent subcode columns in df_local_new for subcode in all_subcodes: - for incidcode in ['','_1','_2','_3','_4','_5']: - df_local_new[subcode+incidcode]*=df_merged[subcode] + for incidcode in ['', '_1', '_2', '_3', '_4', '_5']: + df_local_new[subcode+incidcode] *= df_merged[subcode] counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1166,12 +1174,13 @@ def get_npi_data(fine_resolution=2, else: filename = 'germany_counties_npi_maincat' gd.write_dataframe(df_npis, directory, filename, file_format) - gd.write_dataframe(df_count, directory, 'Exlusions', file_format) + gd.write_dataframe(pd.DataFrame(df_count), directory, + 'Exclusions', file_format) return df_npis -def count_codes(df_npis_old, df_npis_combinations, counties_considered): +def count_codes(df_npis_old, df_count, counties_considered): directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') for county in counties_considered: df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county] @@ -1181,30 +1190,51 @@ def count_codes(df_npis_old, df_npis_combinations, counties_considered): code_dict[column] = 
df_local.iloc[:, 6+np.where( df_local[df_local.NPI_code.str.contains(column)].iloc[:, 6:].max() > 0)[0]].columns - for code in df_npis_combinations.keys(): - column_list = df_npis_combinations[code][1].columns + # with diag + for code in df_count.keys(): + column_list = df_count[code][1].columns for column in range(len(column_list)): for column_other in range(len(column_list)): - df_npis_combinations[code][1].iloc[column, column_other] += len(set( + df_count[code][1].iloc[column, column_other] += len(set( code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - df_npis_combinations[code][1].iloc[column_other, column] += len(set( - code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) + # no diag + # for code in df_npis_combinations.keys(): + # column_list = df_npis_combinations[code][1].columns + # for column in range(len(column_list)): + # for column_other in range(column): + # df_npis_combinations[code][1].iloc[column, column_other] += len(set( + # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + # df_npis_combinations[code][1].iloc[column_other, column] += len(set( + # code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) + + # save results + directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') writer = pd.ExcelWriter( - os.path.join(directory, 'joined_codes.xlsx'), + os.path.join(directory, filename + '.xlsx'), engine='xlsxwriter') - for code in df_npis_combinations.keys(): - df_npis_combinations[code][1].to_excel(writer, sheet_name=code) + for code in df_count.keys(): + df_count[code][1].to_excel(writer, sheet_name=code) writer.close() - colors1 = np.array([[1., 1., 1., 1.]]) - colors2 = mpl.cm.get_cmap('cool')(np.linspace(0, 1, 255)) - colors = np.vstack((colors1, colors2)) - cmap = mpl.colors.LinearSegmentedColormap.from_list('colormap', colors) +# saves plot in folder directory/heatmap_filename + - for code in df_npis_combinations.keys(): +def plot_counter(filename): + directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') + target_directory = os.path.join(directory, 'heatmap_' + filename) + if not os.path.exists(target_directory): + os.makedirs(target_directory) + + codelist = pd.ExcelFile(os.path.join( + directory, filename + '.xlsx'), engine='openpyxl').sheet_names + + cmap = copy.copy(mpl.cm.get_cmap('cool')) + cmap.set_under('white') + + for code in codelist: df = pd.read_excel( - os.path.join(directory, 'joined_codes.xlsx'), + os.path.join(directory, filename + '.xlsx'), sheet_name=code, engine='openpyxl') array_exclusion = df.iloc[:, 1:].to_numpy() fig = plt.figure() @@ -1212,17 +1242,17 @@ def count_codes(df_npis_old, df_npis_combinations, counties_considered): plt.xticks(positions, [colname[-3:] for colname in df.columns.to_list()[1:]]) plt.yticks(positions, df.columns.to_list()[1:]) - plt.imshow(array_exclusion, cmap=cmap, vmin=0) + # set vmin =1 so that only combinations are of interest are in colour, else white + # set vmax = 300000, this should be larger than maxima in all dataframes, + # this way colors of heatmaps are comparable (e.g. between codes or between joined_codes and exclusions) + plt.imshow(array_exclusion, cmap=cmap, vmin=1, vmax=300000) plt.colorbar() plt.savefig( - os.path.join( - directory, 'heatmap_joined_codes', 'joined_codes_{}'.format( - code))) + os.path.join(target_directory, filename + '_{}'.format( + code))) plt.close() - - def main(): """! 
Main program entry.""" From 9c614d149d95e1442ba1ff7ca8eb65d1693116d8 Mon Sep 17 00:00:00 2001 From: Anna Wendler Date: Tue, 4 Apr 2023 17:55:18 +0200 Subject: [PATCH 070/104] fix last commit --- .../memilio/epidata/getNPIData.py | 85 ++++++++++--------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 35008099e9..240b8915c7 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -263,7 +263,7 @@ def read_files(directory, fine_resolution): def activate_npis_based_on_incidence( local_incid, npi_lifting_days_threshold, npi_activation_days_threshold, incid_threshold): - """! + """! Computes an activation vector according to a given incidence threshold, observed incidence and activation or lifting delays. @@ -272,15 +272,15 @@ def activate_npis_based_on_incidence( For a formerly active NPI to be lifted, the incidence has to be below the threshold for npi_lifting_days_threshold many days. - If one of the former cases holds true, then the activation or lifting happens - two days after the satisfaction of the criterion. This is in accordance with - case reporting that can only happen after the day has finished and as these - reports generally appeared in the morning for the previous day, the NPI can - not directly be activated or lifted that day but only on the next day. Hence + If one of the former cases holds true, then the activation or lifting happens + two days after the satisfaction of the criterion. This is in accordance with + case reporting that can only happen after the day has finished and as these + reports generally appeared in the morning for the previous day, the NPI can + not directly be activated or lifted that day but only on the next day. Hence the incidence-dependent NPI is activated or lifted two days after the threshold is/ is not exceeded. Please see the examples for a better understanding. - Example (Threshold=3.5): + Example (Threshold=3.5): local_incid=pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2]) Yesterdays incidence is over the threshold on following days: [?, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0] @@ -299,7 +299,7 @@ def activate_npis_based_on_incidence( NPI should be activated on day 9 (and lifted on day 15; not in the vector) [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] - Another example: + Another example: With yesterday's incidence over threshold on days: [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0] npi_lifting_days_threshold=3, npi_activation_days_threshold=1 @@ -307,7 +307,7 @@ def activate_npis_based_on_incidence( int_active should then be: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0] - Please also note that the first column will always returned as false + Please also note that the first column will always returned as false so the dataframe should not start with dates where NPIs are implemented. For the Corona Datenplattform frame which starts from 2020-03-01 this is no problem for the first days as there were no NPIs. @@ -352,7 +352,7 @@ def activate_npis_based_on_incidence( def drop_codes_and_categories( npi_codes_prior, npi_codes_prior_desc, df_npis_old, fine_resolution): - """! Drops codes and categories from original data frame if they are not + """! Drops codes and categories from original data frame if they are not used. @param npi_codes_prior NPI codes read from description sheet. 
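The worked example in the docstring above can be reproduced in a few lines. The following is a simplified, standalone re-implementation for illustration only; the shipped logic is the activate_npis_based_on_incidence() function itself, which the docstring documents.

import numpy as np
import pandas as pd

def int_active_sketch(local_incid, lifting_days, activation_days, threshold):
    over = (local_incid > threshold).to_numpy()
    active = np.zeros(len(local_incid), dtype=int)
    for day in range(1, len(local_incid)):
        if day >= activation_days and over[day - activation_days:day].all():
            active[day] = 1      # threshold exceeded sufficiently long
        elif day >= lifting_days and not over[day - lifting_days:day].any():
            active[day] = 0      # below threshold sufficiently long
        else:
            active[day] = active[day - 1]  # otherwise keep current state
    return active

incid = pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
print(int_active_sketch(incid, lifting_days=7, activation_days=2,
                        threshold=3.5))
# [0 0 0 0 0 0 0 0 1 1 1 1 1 1], i.e. activation on day 9 as documented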
@@ -447,7 +447,7 @@ def get_npi_data(fine_resolution=2, -fine_resolution=1: germany_counties_npi_subcat_incgrouped -fine_resolution=0: germany_counties_npi_maincat - Needs the files 'cases_all_county_all_dates_repdate.json' and + Needs the files 'cases_all_county_all_dates_repdate.json' and 'county_current_population.json' which can be created by the functions getCasesData.py (with argument --rep-date) and getPopulationData.py. @@ -478,7 +478,7 @@ def get_npi_data(fine_resolution=2, county IDs from 1001 to 16xxx. @param npi_activation_days_threshold [Default: 3]. Defines necessary number of days exceeding case incidence threshold to activate NPIs. - @param npi_alifting_days_threshold [Default: 5]. Defines necessary number + @param npi_alifting_days_threshold [Default: 5]. Defines necessary number of days below case incidence threshold threshold to lift NPIs. """ @@ -879,15 +879,24 @@ def get_npi_data(fine_resolution=2, counter_cases_start = 0 # setup dataframe for each maingroup, same format as df_npi_combinations - df_count = copy.deepcopy(df_npis_combinations) - for code in df_count.keys(): - df_count[code][1] *= 0 + # used + # used to count number of codes that are deactivated + df_count_deactivation = copy.deepcopy(df_npis_combinations) + for code in df_count_deactivation.keys(): + df_count_deactivation[code][1] *= 0 # create dataframe to count multiple codes after incidence dependent (de-)activation df_count_incid_depend = pd.DataFrame() - # count_codes(df_npis_old, df_count_incid_depend, counties_considered, - # counties_considered=counties_considered) + # setup dataframe for each maingroup, same format as df_npi_combinations + # used + # used to count codes that occur simultaneously now (before any (de-)activation) + # df_count_joined_codes = copy.deepcopy(df_npis_combinations) + # for code in df_count_joined_codes.keys(): + # df_count_joined_codes[code][1] *= 0 + # df_counted_joined_codes = count_codes(df_npis_old, df_count_joined_codes, + # counties_considered=counties_considered) + # save_counter(df_counted_joined_codes, 'joined_codes') # plot_counter('joined_codes') all_subcodes = [] @@ -1094,8 +1103,8 @@ def get_npi_data(fine_resolution=2, str(subcode) + ' on ' + str(len(days_deact)) + ' days.') print('\n') df_merged.loc[subcode_active, nocombi_code] = 0 - df_count[maincode][1].loc[idx_strictness, - nocombi_code] += len(days_deact) + df_count_deactivation[maincode][1].loc[idx_strictness, + nocombi_code] += len(days_deact) # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: @@ -1106,6 +1115,8 @@ def get_npi_data(fine_resolution=2, for incidcode in ['', '_1', '_2', '_3', '_4', '_5']: df_local_new[subcode+incidcode] *= df_merged[subcode] + save_counter(df_count_deactivation, 'count_deactivation') + counters[cid] += time.perf_counter()-start_time cid += 1 ### ### @@ -1188,41 +1199,33 @@ def get_npi_data(fine_resolution=2, def count_codes(df_npis_old, df_count, counties_considered): - directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') for county in counties_considered: df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county] code_dict = {} - for code in df_npis_combinations.keys(): - for column in df_npis_combinations[code][1].columns: + for code in df_count.keys(): + for column in df_count[code][1].columns: code_dict[column] = df_local.iloc[:, 6+np.where( df_local[df_local.NPI_code.str.contains(column)].iloc[:, 6:].max() > 0)[0]].columns - # for code in df_npis_combinations.keys(): - # column_list = 
df_npis_combinations[code][1].columns + # with diag + # for code in df_count.keys(): + # column_list = df_count[code][1].columns # for column in range(len(column_list)): # for column_other in range(len(column_list)): - # df_npis_combinations[code][1].iloc[column, column_other] += len(set( + # df_count[code][1].iloc[column, column_other] += len(set( # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - # df_npis_combinations[code][1].iloc[column_other, column] += len(set( - # code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) # no diag - for code in df_npis_combinations.keys(): - column_list = df_npis_combinations[code][1].columns + for code in df_count.keys(): + column_list = df_count[code][1].columns for column in range(len(column_list)): for column_other in range(column): - df_npis_combinations[code][1].iloc[column, column_other] += len(set( + df_count[code][1].iloc[column, column_other] += len(set( code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - # no diag - # for code in df_npis_combinations.keys(): - # column_list = df_npis_combinations[code][1].columns - # for column in range(len(column_list)): - # for column_other in range(column): - # df_npis_combinations[code][1].iloc[column, column_other] += len(set( - # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - # df_npis_combinations[code][1].iloc[column_other, column] += len(set( - # code_dict[column_list[column_other]]).intersection(set(code_dict[column_list[column]]))) + return df_count + +def save_counter(df_count, filename): # save results directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') @@ -1258,9 +1261,9 @@ def plot_counter(filename): plt.xticks(positions, [colname[-3:] for colname in df.columns.to_list()[1:]]) plt.yticks(positions, df.columns.to_list()[1:]) - # set vmin =1 so that only combinations are of interest are in colour, else white + # set vmin = 1 so that only combinations that are of interest are in colour, else white # set vmax = 300000, this should be larger than maxima in all dataframes, - # this way colors of heatmaps are comparable (e.g. between codes or between joined_codes and exclusions) + # this way colours of heatmaps are comparable (e.g. between codes or between joined_codes and exclusions) plt.imshow(array_exclusion, cmap=cmap, vmin=1, vmax=300000) plt.colorbar() plt.savefig( From 5f9f6451378455c09f7f067bf51f676f05fd28de Mon Sep 17 00:00:00 2001 From: Anna Wendler Date: Thu, 6 Apr 2023 17:07:44 +0200 Subject: [PATCH 071/104] count joined codes for incid_depend --- .../memilio/epidata/getNPIData.py | 109 ++++++++++++------ 1 file changed, 73 insertions(+), 36 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 240b8915c7..7d97f8f90e 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -172,9 +172,9 @@ def read_files(directory, fine_resolution): dummy_to_append = pd.DataFrame( columns=['code'] + dates_new, - data=df_npis_per_code - [df_npis_per_code.ags5 == counties[i]]. - iloc[:, 6:].T.reset_index().values.copy()) + data=copy.deepcopy(df_npis_per_code + [df_npis_per_code.ags5 == counties[i]]. 
+ iloc[:, 6:].T.reset_index().values)) df_local[i] = pd.concat([df_local[i], dummy_to_append]) @@ -631,8 +631,8 @@ def get_npi_data(fine_resolution=2, for i in range(len(npi_groups_combinations_unique)): codes_local = df_npis_combinations[npi_groups_combinations_unique[i] ][1].columns[1:] - df_out = df_npis_combinations[npi_groups_combinations_unique[i]][ - 1].copy() + df_out = copy.deepcopy(df_npis_combinations[npi_groups_combinations_unique[i]][ + 1]) df_out.insert( 0, 'Description (German)', [desc @@ -878,27 +878,28 @@ def get_npi_data(fine_resolution=2, df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True) counter_cases_start = 0 - # setup dataframe for each maingroup, same format as df_npi_combinations - # used - # used to count number of codes that are deactivated - df_count_deactivation = copy.deepcopy(df_npis_combinations) - for code in df_count_deactivation.keys(): - df_count_deactivation[code][1] *= 0 - - # create dataframe to count multiple codes after incidence dependent (de-)activation - df_count_incid_depend = pd.DataFrame() - - # setup dataframe for each maingroup, same format as df_npi_combinations - # used - # used to count codes that occur simultaneously now (before any (de-)activation) + # # setup dataframe for each maingroup, same format as df_npi_combinations + # # used to count codes that occur simultaneously now (before any (de-)activation) # df_count_joined_codes = copy.deepcopy(df_npis_combinations) - # for code in df_count_joined_codes.keys(): - # df_count_joined_codes[code][1] *= 0 + # for subcode in df_count_joined_codes.keys(): + # df_count_joined_codes[subcode][1] *= 0 # df_counted_joined_codes = count_codes(df_npis_old, df_count_joined_codes, # counties_considered=counties_considered) # save_counter(df_counted_joined_codes, 'joined_codes') # plot_counter('joined_codes') + # create dataframe to count multiple codes after incidence dependent (de-)activation + df_incid_depend = pd.DataFrame() + df_count_incid_depend = copy.deepcopy(df_npis_combinations) + for maincode in df_count_incid_depend.keys(): + df_count_incid_depend[maincode][1] *= 0 + + # setup dataframe for each maingroup, same format as df_npi_combinations + # used to count number of codes that are deactivated + df_count_deactivation = copy.deepcopy(df_npis_combinations) + for maincode in df_count_deactivation.keys(): + df_count_deactivation[maincode][1] *= 0 + all_subcodes = [] for maincode in df_npis_combinations.keys(): all_subcodes += df_npis_combinations[maincode][1].columns.to_list() @@ -912,8 +913,8 @@ def get_npi_data(fine_resolution=2, if fine_resolution > 0: # compute incidence based on previous data frames - df_infec_local = df_infec_rki[df_infec_rki[dd.EngEng['idCounty']] == countyID].copy( - ) + df_infec_local = copy.deepcopy( + df_infec_rki[df_infec_rki[dd.EngEng['idCounty']] == countyID]) pop_local = df_population.loc[df_population[dd.EngEng['idCounty']] == countyID, dd.EngEng['population']].values[0] @@ -936,7 +937,7 @@ def get_npi_data(fine_resolution=2, df_infec_local = df_infec_local[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( df_infec_local[dd.EngEng['date']] <= end_date_new)].reset_index() - local_incid = df_infec_local['Incidence'].copy() + local_incid = copy.deepcopy(df_infec_local['Incidence']) # Count counties with start cases >= 1: # In this case NPI activation cannot be ensured to work as expected if cases_first_value >= 1: @@ -944,8 +945,8 @@ def get_npi_data(fine_resolution=2, # get county-local data frame start_time = time.perf_counter() - 
df_local_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']] - == countyID].copy() + df_local_old = copy.deepcopy(df_npis_old[df_npis_old[dd.EngEng['idCounty']] + == countyID]) # Consistency of incidence dependent NPIs: # The same NPI should not be prescribed multiple times at the same day @@ -989,8 +990,8 @@ def get_npi_data(fine_resolution=2, # old dataframe has npi codes as columns and date values as rows # new dataframe should be transposed - df_local_new = df_local_old.iloc[npi_rows, start_npi_cols-1:].set_index( - dd.EngEng['npiCode']).transpose().copy() + df_local_new = copy.deepcopy(df_local_old.iloc[npi_rows, start_npi_cols-1:].set_index( + dd.EngEng['npiCode']).transpose()) # get datetime as a column (previously index after transposing) df_local_new = df_local_new.reset_index( drop=False).rename( @@ -1047,9 +1048,6 @@ def get_npi_data(fine_resolution=2, df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)] \ = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) - df_count_incid_depend = pd.concat( - [df_count_incid_depend, df_local_new.copy()]) - # merge incidence dependent NPIs to have only one column for each subcode df_merged = df_local_new.iloc[:, :2].copy() for subcode in all_subcodes: @@ -1057,6 +1055,9 @@ def get_npi_data(fine_resolution=2, regex=subcode).sum(axis=1) # strictness deactivation is done with this merged dataframe + df_incid_depend = pd.concat( + [df_incid_depend, copy.deepcopy(df_merged)]) + if df_merged.max()[2:].max() > 1: raise gd.DataError('Error in merging...') @@ -1116,6 +1117,7 @@ def get_npi_data(fine_resolution=2, df_local_new[subcode+incidcode] *= df_merged[subcode] save_counter(df_count_deactivation, 'count_deactivation') + plot_counter('count_deactivation') counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1150,6 +1152,12 @@ def get_npi_data(fine_resolution=2, 'Please consider a start date of some weeks ahead of the ' 'time window to be analyzed for NPI\'s effects.') + # count joined codes from after incidence based activation + count_codes_incid_depend( + df_incid_depend, df_count_incid_depend, counties_considered) + save_counter(df_count_incid_depend, 'joined_codes_incid_depend') + plot_counter('joined_codes_incid_depend') + # print sub counters print('Sub task counters are: ') print(counters) @@ -1202,8 +1210,8 @@ def count_codes(df_npis_old, df_count, counties_considered): for county in counties_considered: df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county] code_dict = {} - for code in df_count.keys(): - for column in df_count[code][1].columns: + for maincode in df_count.keys(): + for column in df_count[maincode][1].columns: code_dict[column] = df_local.iloc[:, 6+np.where( df_local[df_local.NPI_code.str.contains(column)].iloc[:, 6:].max() > 0)[0]].columns @@ -1216,15 +1224,43 @@ def count_codes(df_npis_old, df_count, counties_considered): # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) # no diag - for code in df_count.keys(): - column_list = df_count[code][1].columns + for maincode in df_count.keys(): + column_list = df_count[maincode][1].columns for column in range(len(column_list)): for column_other in range(column): - df_count[code][1].iloc[column, column_other] += len(set( + df_count[maincode][1].iloc[column, column_other] += len(set( code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) return df_count +def count_codes_incid_depend(df_incid_depend, df_count_incid_depend, counties_considered): 
+ for county in counties_considered: + df_local = df_incid_depend[df_incid_depend[dd.EngEng['idCounty']] == county] + code_dict = {} + for maincode in df_count_incid_depend.keys(): + for column in df_count_incid_depend[maincode][1].columns: + code_dict[column] = df_local.iloc[np.where( + df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() + + # with diag + # for maincode in df_count_incid_depend.keys(): + # column_list = df_count_incid_depend[maincode][1].columns + # for column in range(len(column_list)): + # for column_other in range(len(column_list)): + # df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( + # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + + # no diag + for maincode in df_count_incid_depend.keys(): + column_list = df_count_incid_depend[maincode][1].columns + for column in range(len(column_list)): + for column_other in range(column): + df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( + code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + + return df_count_incid_depend + + def save_counter(df_count, filename): # save results directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') @@ -1261,7 +1297,8 @@ def plot_counter(filename): plt.xticks(positions, [colname[-3:] for colname in df.columns.to_list()[1:]]) plt.yticks(positions, df.columns.to_list()[1:]) - # set vmin = 1 so that only combinations that are of interest are in colour, else white + # set vmin = 1 so that only combinations that are simultaneously active at least on one day are in colour, + # else white # set vmax = 300000, this should be larger than maxima in all dataframes, # this way colours of heatmaps are comparable (e.g. 
between codes or between joined_codes and exclusions) plt.imshow(array_exclusion, cmap=cmap, vmin=1, vmax=300000) From c61f62a304c87bc9054c7f44e35303dbb513590f Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 18 Apr 2023 10:26:30 +0200 Subject: [PATCH 072/104] fix OSError --- .../memilio/epidata/getNPIData.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 7d97f8f90e..6de9ffd41f 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -885,8 +885,8 @@ def get_npi_data(fine_resolution=2, # df_count_joined_codes[subcode][1] *= 0 # df_counted_joined_codes = count_codes(df_npis_old, df_count_joined_codes, # counties_considered=counties_considered) - # save_counter(df_counted_joined_codes, 'joined_codes') - # plot_counter('joined_codes') + # save_counter(df_counted_joined_codes, 'joined_codes', directory) + # plot_counter('joined_codes', directory) # create dataframe to count multiple codes after incidence dependent (de-)activation df_incid_depend = pd.DataFrame() @@ -1116,8 +1116,8 @@ def get_npi_data(fine_resolution=2, for incidcode in ['', '_1', '_2', '_3', '_4', '_5']: df_local_new[subcode+incidcode] *= df_merged[subcode] - save_counter(df_count_deactivation, 'count_deactivation') - plot_counter('count_deactivation') + save_counter(df_count_deactivation, 'count_deactivation', directory) + plot_counter('count_deactivation', directory) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1155,8 +1155,8 @@ def get_npi_data(fine_resolution=2, # count joined codes from after incidence based activation count_codes_incid_depend( df_incid_depend, df_count_incid_depend, counties_considered) - save_counter(df_count_incid_depend, 'joined_codes_incid_depend') - plot_counter('joined_codes_incid_depend') + save_counter(df_count_incid_depend, 'joined_codes_incid_depend', directory) + plot_counter('joined_codes_incid_depend', directory) # print sub counters print('Sub task counters are: ') @@ -1261,9 +1261,8 @@ def count_codes_incid_depend(df_incid_depend, df_count_incid_depend, counties_co return df_count_incid_depend -def save_counter(df_count, filename): +def save_counter(df_count, filename, directory): # save results - directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') writer = pd.ExcelWriter( os.path.join(directory, filename + '.xlsx'), @@ -1275,8 +1274,7 @@ def save_counter(df_count, filename): # saves plot in folder directory/heatmaps_filename -def plot_counter(filename): - directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') +def plot_counter(filename, directory): target_directory = os.path.join(directory, 'heatmaps_' + filename) if not os.path.exists(target_directory): os.makedirs(target_directory) From c0df479569ec186bf3928639996f81e59cb4bb97 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 18 Apr 2023 10:40:47 +0200 Subject: [PATCH 073/104] mock plot function --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 2 -- .../memilio/epidata_test/test_epidata_getNPIData.py | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 6de9ffd41f..e3a1e7bd54 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1200,8 +1200,6 @@ def 
get_npi_data(fine_resolution=2, else: filename = 'germany_counties_npi_maincat' gd.write_dataframe(df_npis, directory, filename, file_format) - gd.write_dataframe(df_count_incid_depend, directory, - 'joined_codes_incid_dependent', file_format) return df_npis diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index a3dc77f84e..5d3c4a0773 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -24,6 +24,7 @@ import os import pandas as pd import numpy as np +import matplotlib.pyplot as plt from datetime import date @@ -276,7 +277,8 @@ def test_drop_codes_and_categories(self): return_value=[[], df_npis_desc['Variablenname'], df_npis_old_renamed]) - def test_get_npi_data(self, mock_codes, mock_read, mock_data): + @patch('memilio.epidata.getNPIData.plot_counter') + def test_get_npi_data(self, mock_plot, mock_codes, mock_read, mock_data): # print 'Additional errors in consistent naming' is expected. # print 'WARNING: DataFrame starts with reported cases > 0 for more than 5 percent...' is expected. npis_test = gnd.get_npi_data( From ecc7272828529f6f25063cb2e73a4a7aef81220e Mon Sep 17 00:00:00 2001 From: patricklnz Date: Thu, 20 Apr 2023 13:58:03 +0200 Subject: [PATCH 074/104] fix counter --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index e3a1e7bd54..7aea86b180 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1116,9 +1116,6 @@ def get_npi_data(fine_resolution=2, for incidcode in ['', '_1', '_2', '_3', '_4', '_5']: df_local_new[subcode+incidcode] *= df_merged[subcode] - save_counter(df_count_deactivation, 'count_deactivation', directory) - plot_counter('count_deactivation', directory) - counters[cid] += time.perf_counter()-start_time cid += 1 ### ### @@ -1144,6 +1141,9 @@ def get_npi_data(fine_resolution=2, '. Estimated time remaining: ' + str(int(time_remain / 60)) + ' min.') + save_counter(df_count_deactivation, 'count_deactivation', directory) + plot_counter('count_deactivation', directory) + if counter_cases_start >= len(counties_considered)*0.05: print('WARNING: DataFrame starts with reported cases > 0 ' 'for more than 5 percent of the counties to be considered. 
' From 27df4fb40d6508a6878cf81fa26e6831349291b1 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Fri, 21 Apr 2023 14:02:16 +0200 Subject: [PATCH 075/104] counter for active codes after all (de-)activation --- .../memilio/epidata/getNPIData.py | 54 +++++++++++++++---- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 7aea86b180..9f06eec882 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -878,15 +878,15 @@ def get_npi_data(fine_resolution=2, df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True) counter_cases_start = 0 - # # setup dataframe for each maingroup, same format as df_npi_combinations - # # used to count codes that occur simultaneously now (before any (de-)activation) - # df_count_joined_codes = copy.deepcopy(df_npis_combinations) - # for subcode in df_count_joined_codes.keys(): - # df_count_joined_codes[subcode][1] *= 0 - # df_counted_joined_codes = count_codes(df_npis_old, df_count_joined_codes, - # counties_considered=counties_considered) - # save_counter(df_counted_joined_codes, 'joined_codes', directory) - # plot_counter('joined_codes', directory) + # setup dataframe for each maingroup, same format as df_npi_combinations + # used to count codes that occur simultaneously now (before any (de-)activation) + df_count_joined_codes = copy.deepcopy(df_npis_combinations) + for subcode in df_count_joined_codes.keys(): + df_count_joined_codes[subcode][1] *= 0 + df_counted_joined_codes = count_codes(df_npis_old, df_count_joined_codes, + counties_considered=counties_considered) + save_counter(df_counted_joined_codes, 'joined_codes', directory) + plot_counter('joined_codes', directory) # create dataframe to count multiple codes after incidence dependent (de-)activation df_incid_depend = pd.DataFrame() @@ -894,6 +894,11 @@ def get_npi_data(fine_resolution=2, for maincode in df_count_incid_depend.keys(): df_count_incid_depend[maincode][1] *= 0 + # create dataframe to count multiple codes after strictness deactivation + df_count_active = copy.deepcopy(df_npis_combinations) + for maincode in df_count_active.keys(): + df_count_active[maincode][1] *= 0 + # setup dataframe for each maingroup, same format as df_npi_combinations # used to count number of codes that are deactivated df_count_deactivation = copy.deepcopy(df_npis_combinations) @@ -1106,6 +1111,10 @@ def get_npi_data(fine_resolution=2, df_merged.loc[subcode_active, nocombi_code] = 0 df_count_deactivation[maincode][1].loc[idx_strictness, nocombi_code] += len(days_deact) + + count_codes_active(df_merged, df_count_active, counties_considered) + save_counter(df_count_active, 'joined_codes_active', directory) + plot_counter('joined_codes_active', directory) # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: @@ -1230,6 +1239,33 @@ def count_codes(df_npis_old, df_count, counties_considered): code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) return df_count +def count_codes_active(df_merged, df_count_active, counties_considered): + for county in counties_considered: + df_local = df_merged[df_merged[dd.EngEng['idCounty']] == county] + code_dict = {} + for maincode in df_count_active.keys(): + for column in df_count_active[maincode][1].columns: + code_dict[column] = df_local.iloc[np.where( + df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 
0)[0], 0].to_list() + + # with diag + # for maincode in df_count_active.keys(): + # column_list = df_count_active[maincode][1].columns + # for column in range(len(column_list)): + # for column_other in range(len(column_list)): + # df_count_active[maincode][1].iloc[column, column_other] += len(set( + # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + + # no diag + for maincode in df_count_active.keys(): + column_list = df_count_active[maincode][1].columns + for column in range(len(column_list)): + for column_other in range(column): + df_count_active[maincode][1].iloc[column, column_other] += len(set( + code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + + return df_count_active + def count_codes_incid_depend(df_incid_depend, df_count_incid_depend, counties_considered): for county in counties_considered: From d975ca6a5fb86a5b019e9a8ea2d0fd310e80da65 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 25 Apr 2023 14:06:32 +0200 Subject: [PATCH 076/104] adjust counter --- .../memilio/epidata/getNPIData.py | 110 +++++++++--------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 9f06eec882..26d2eabcfd 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1111,10 +1111,13 @@ def get_npi_data(fine_resolution=2, df_merged.loc[subcode_active, nocombi_code] = 0 df_count_deactivation[maincode][1].loc[idx_strictness, nocombi_code] += len(days_deact) - - count_codes_active(df_merged, df_count_active, counties_considered) - save_counter(df_count_active, 'joined_codes_active', directory) - plot_counter('joined_codes_active', directory) + + # count joined codes from after strictness based deactivation + count_codes_active(df_merged, df_count_active, countyID) + + # count joined codes from after incidence based activation + count_codes_incid_depend( + df_incid_depend, df_count_incid_depend, countyID) # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: @@ -1161,12 +1164,12 @@ def get_npi_data(fine_resolution=2, 'Please consider a start date of some weeks ahead of the ' 'time window to be analyzed for NPI\'s effects.') - # count joined codes from after incidence based activation - count_codes_incid_depend( - df_incid_depend, df_count_incid_depend, counties_considered) save_counter(df_count_incid_depend, 'joined_codes_incid_depend', directory) plot_counter('joined_codes_incid_depend', directory) + save_counter(df_count_active, 'joined_codes_active', directory) + plot_counter('joined_codes_active', directory) + # print sub counters print('Sub task counters are: ') print(counters) @@ -1239,58 +1242,57 @@ def count_codes(df_npis_old, df_count, counties_considered): code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) return df_count -def count_codes_active(df_merged, df_count_active, counties_considered): - for county in counties_considered: - df_local = df_merged[df_merged[dd.EngEng['idCounty']] == county] - code_dict = {} - for maincode in df_count_active.keys(): - for column in df_count_active[maincode][1].columns: - code_dict[column] = df_local.iloc[np.where( - df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() - # with diag - # for maincode in df_count_active.keys(): - # column_list = df_count_active[maincode][1].columns - # 
for column in range(len(column_list)): - # for column_other in range(len(column_list)): - # df_count_active[maincode][1].iloc[column, column_other] += len(set( - # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - - # no diag - for maincode in df_count_active.keys(): - column_list = df_count_active[maincode][1].columns - for column in range(len(column_list)): - for column_other in range(column): - df_count_active[maincode][1].iloc[column, column_other] += len(set( - code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) +def count_codes_active(df_merged, df_count_active, county): + df_local = df_merged[df_merged[dd.EngEng['idCounty']] == county] + code_dict = {} + for maincode in df_count_active.keys(): + for column in df_count_active[maincode][1].columns: + code_dict[column] = df_local.iloc[np.where( + df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() + + # with diag + # for maincode in df_count_active.keys(): + # column_list = df_count_active[maincode][1].columns + # for column in range(len(column_list)): + # for column_other in range(len(column_list)): + # df_count_active[maincode][1].iloc[column, column_other] += len(set( + # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + + # no diag + for maincode in df_count_active.keys(): + column_list = df_count_active[maincode][1].columns + for column in range(len(column_list)): + for column_other in range(column): + df_count_active[maincode][1].iloc[column, column_other] += len(set( + code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) return df_count_active -def count_codes_incid_depend(df_incid_depend, df_count_incid_depend, counties_considered): - for county in counties_considered: - df_local = df_incid_depend[df_incid_depend[dd.EngEng['idCounty']] == county] - code_dict = {} - for maincode in df_count_incid_depend.keys(): - for column in df_count_incid_depend[maincode][1].columns: - code_dict[column] = df_local.iloc[np.where( - df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() - - # with diag - # for maincode in df_count_incid_depend.keys(): - # column_list = df_count_incid_depend[maincode][1].columns - # for column in range(len(column_list)): - # for column_other in range(len(column_list)): - # df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( - # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - - # no diag - for maincode in df_count_incid_depend.keys(): - column_list = df_count_incid_depend[maincode][1].columns - for column in range(len(column_list)): - for column_other in range(column): - df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( - code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) +def count_codes_incid_depend(df_incid_depend, df_count_incid_depend, county): + df_local = df_incid_depend[df_incid_depend[dd.EngEng['idCounty']] == county] + code_dict = {} + for maincode in df_count_incid_depend.keys(): + for column in df_count_incid_depend[maincode][1].columns: + code_dict[column] = df_local.iloc[np.where( + df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() + + # with diag + # for maincode in df_count_incid_depend.keys(): + # column_list = df_count_incid_depend[maincode][1].columns + # for column in range(len(column_list)): + # for 
column_other in range(len(column_list)): + # df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( + # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) + + # no diag + for maincode in df_count_incid_depend.keys(): + column_list = df_count_incid_depend[maincode][1].columns + for column in range(len(column_list)): + for column_other in range(column): + df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( + code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) return df_count_incid_depend From 478d8fbd7632ee9f88bcc61bad44c07a5ddd9cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 28 Apr 2023 17:05:24 +0200 Subject: [PATCH 077/104] refactoring of count_codes with some small naming improvements and to better tackle diagonal --- .../memilio/epidata/getNPIData.py | 84 ++++++++++++------- 1 file changed, 53 insertions(+), 31 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 26d2eabcfd..9a30795b0d 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -880,13 +880,13 @@ def get_npi_data(fine_resolution=2, # setup dataframe for each maingroup, same format as df_npi_combinations # used to count codes that occur simultaneously now (before any (de-)activation) - df_count_joined_codes = copy.deepcopy(df_npis_combinations) - for subcode in df_count_joined_codes.keys(): - df_count_joined_codes[subcode][1] *= 0 - df_counted_joined_codes = count_codes(df_npis_old, df_count_joined_codes, + df_count_joint_codes = copy.deepcopy(df_npis_combinations) + for maincode in df_count_joint_codes.keys(): + df_count_joint_codes[maincode][1] *= 0 + df_counted_joint_codes = count_code_multiplicities_init(df_npis_old, df_count_joint_codes, counties_considered=counties_considered) - save_counter(df_counted_joined_codes, 'joined_codes', directory) - plot_counter('joined_codes', directory) + save_counter(df_counted_joint_codes, 'joint_codes', directory) + plot_counter('joint_codes', directory) # create dataframe to count multiple codes after incidence dependent (de-)activation df_incid_depend = pd.DataFrame() @@ -1112,10 +1112,10 @@ def get_npi_data(fine_resolution=2, df_count_deactivation[maincode][1].loc[idx_strictness, nocombi_code] += len(days_deact) - # count joined codes from after strictness based deactivation + # count joint codes from after strictness based deactivation count_codes_active(df_merged, df_count_active, countyID) - # count joined codes from after incidence based activation + # count joint codes from after incidence based activation count_codes_incid_depend( df_incid_depend, df_count_incid_depend, countyID) @@ -1164,11 +1164,11 @@ def get_npi_data(fine_resolution=2, 'Please consider a start date of some weeks ahead of the ' 'time window to be analyzed for NPI\'s effects.') - save_counter(df_count_incid_depend, 'joined_codes_incid_depend', directory) - plot_counter('joined_codes_incid_depend', directory) + save_counter(df_count_incid_depend, 'joint_codes_incid_depend', directory) + plot_counter('joint_codes_incid_depend', directory) - save_counter(df_count_active, 'joined_codes_active', directory) - plot_counter('joined_codes_active', directory) + save_counter(df_count_active, 'joint_codes_active', directory) + plot_counter('joint_codes_active', directory) # print sub counters print('Sub task 
counters are: ')
    print(counters)

@@ -1216,30 +1216,52 @@ def get_npi_data(fine_resolution=2,
     return df_npis


-def count_codes(df_npis_old, df_count, counties_considered):
+def count_code_multiplicities_init(df_npis_old, df_count, counties_considered):
+    """! Count multiply for all pairs of NPI codes how many times they were
+    mentioned at the same day in the initial data frame.
+
+    @param df_npis_old Initial data frame read from Corona Datenplattform.
+    @param df_count Dictionary of main NPI codes with empty interaction
+    matrix (to be filled) for all codes under main code in df_count[maincode][1]
+    @param counties_considered County IDs for which initial data frame is
+    considered.
+    """
     for county in counties_considered:
         df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county]
-        code_dict = {}
+        # get column where dates start
+        npi_date_start_col = np.where(
+            df_local.columns.str.startswith('d20') == True)[0][0]
+        # prepare dictionary for dates when code was mentioned
+        code_dates = {}
+        # run through all maincodes (i.e., first 3-4 characters like M01a or M11)
         for maincode in df_count.keys():
-            for column in df_count[maincode][1].columns:
-                code_dict[column] = df_local.iloc[:, 6+np.where(
-                    df_local[df_local.NPI_code.str.contains(column)].iloc[:, 6:].max() > 0)[0]].columns
-
-        # with diag
-        # for code in df_count.keys():
-        #     column_list = df_count[code][1].columns
-        #     for column in range(len(column_list)):
-        #         for column_other in range(len(column_list)):
-        #             df_count[code][1].iloc[column, column_other] += len(set(
-        #                 code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]])))
+            code_list = df_count[maincode][1].columns
+            # iterate over code/row indices 0 to n
+            for code_idx in range(len(code_list)):
+                # get dates where NPI is mentioned as existing in potential intervention set
+                npi_rows = df_local.NPI_code.str.contains(code_list[code_idx])
+                npi_dates_in_df = np.where(
+                    df_local[npi_rows].iloc[:, npi_date_start_col:].max() > 0)[0]
+                # store non-transformed dates in code_dict
+                code_dates[code_list[code_idx]] = df_local.iloc[:,
+                    npi_date_start_col + npi_dates_in_df].columns
+
+                # count number of multiply mentioned NPIs with different incidence thresholds for the same day
+                df_count[maincode][1].iloc[code_idx, code_idx] = df_local[npi_rows].iloc[:,
+                    npi_date_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df)

         # no diag
         for maincode in df_count.keys():
-            column_list = df_count[maincode][1].columns
-            for column in range(len(column_list)):
-                for column_other in range(column):
-                    df_count[maincode][1].iloc[column, column_other] += len(set(
-                        code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]])))
+            code_list = df_count[maincode][1].columns
+            # iterate over rows in matrix df_count with code/row indices 0 to n
+            for code_idx in range(len(code_list)):
+                # iterate over code/column indices 0 to code_idx-1 (not filling diagonal)
+                # Note that the upper diagonal part of the matrix does not
+                # need to be considered as matrix is symmetric.
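The lower-triangle filling that continues below reduces to intersecting per-code date sets. A minimal, self-contained sketch of that idea, with made-up codes and dates rather than actual Datenplattform values:

    import numpy as np
    import pandas as pd

    # hypothetical date sets on which each NPI code was mentioned
    code_dates = {'M01a_010': {'d20200301', 'd20200302', 'd20200303'},
                  'M01a_020': {'d20200302', 'd20200303'},
                  'M01a_030': {'d20200303', 'd20200310'}}
    code_list = list(code_dates.keys())
    count = pd.DataFrame(np.zeros((3, 3), dtype=int),
                         index=code_list, columns=code_list)
    for i in range(len(code_list)):
        for j in range(i):  # lower triangle only; counts are symmetric
            count.iloc[i, j] += len(code_dates[code_list[i]]
                                    & code_dates[code_list[j]])
    # count.loc['M01a_020', 'M01a_010'] == 2: both codes named on two days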
+ for code_idx_other in range(code_idx): + df_count[maincode][1].iloc[code_idx, code_idx_other] += len(set( + code_dates[code_list[code_idx]]).intersection(set(code_dates[code_list[code_idx_other]]))) + return df_count @@ -1334,7 +1356,7 @@ def plot_counter(filename, directory): # set vmin = 1 so that only combinations that are simultaneously active at least on one day are in colour, # else white # set vmax = 300000, this should be larger than maxima in all dataframes, - # this way colours of heatmaps are comparable (e.g. between codes or between joined_codes and exclusions) + # this way colours of heatmaps are comparable (e.g. between codes or between joint_codes and exclusions) plt.imshow(array_exclusion, cmap=cmap, vmin=1, vmax=300000) plt.colorbar() plt.savefig( From 11f13889c70d696c7974cf089ed670ec3d200bd3 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 8 May 2023 12:12:09 +0200 Subject: [PATCH 078/104] adjust plotting --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 9a30795b0d..ac67134fe5 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1340,8 +1340,7 @@ def plot_counter(filename, directory): codelist = pd.ExcelFile(os.path.join( directory, filename + '.xlsx'), engine='openpyxl').sheet_names - cmap = copy.copy(mpl.cm.get_cmap('cool')) - cmap.set_under('white') + cmap = copy.copy(mpl.cm.get_cmap('OrRd')) for code in codelist: df = pd.read_excel( @@ -1357,7 +1356,7 @@ def plot_counter(filename, directory): # else white # set vmax = 300000, this should be larger than maxima in all dataframes, # this way colours of heatmaps are comparable (e.g. 
between codes or between joint_codes and exclusions) - plt.imshow(array_exclusion, cmap=cmap, vmin=1, vmax=300000) + plt.imshow(array_exclusion, cmap=cmap, norm=mpl.colors.LogNorm(vmin = 1, vmax=300000)) plt.colorbar() plt.savefig( os.path.join(target_directory, filename + '_{}'.format( From 1c43290c7afaabfd278ba7d6e05c42a1d45d5ab6 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 8 May 2023 14:31:30 +0200 Subject: [PATCH 079/104] plot diag --- .../memilio/epidata/getNPIData.py | 39 ++++++++++++++++--- 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index ac67134fe5..989626459f 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -884,9 +884,10 @@ def get_npi_data(fine_resolution=2, for maincode in df_count_joint_codes.keys(): df_count_joint_codes[maincode][1] *= 0 df_counted_joint_codes = count_code_multiplicities_init(df_npis_old, df_count_joint_codes, - counties_considered=counties_considered) + counties_considered=counties_considered) save_counter(df_counted_joint_codes, 'joint_codes', directory) plot_counter('joint_codes', directory) + plot_multiple_prescriptions('joint_codes', directory) # create dataframe to count multiple codes after incidence dependent (de-)activation df_incid_depend = pd.DataFrame() @@ -1230,7 +1231,7 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered): df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county] # get column where dates start npi_date_start_col = np.where( - df_local.columns.str.startswith('d20') == True)[0][0] + df_local.columns.str.startswith('d2') == True)[0][0] # prepare dictionnary for dates when code was mentioned code_dates = {} # run through all maincodes (i.e., first 3-4 characters like M01a or M11) @@ -1247,7 +1248,7 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered): npi_date_start_col + npi_dates_in_df].columns # count number of multiply mentionned NPIs with different incidence thresholds for the same day - df_count[maincode][1].iloc[code_idx, code_idx] = df_local[npi_rows].iloc[:, + df_count[maincode][1].iloc[code_idx, code_idx] += df_local[npi_rows].iloc[:, npi_date_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df) # no diag @@ -1349,14 +1350,14 @@ def plot_counter(filename, directory): array_exclusion = df.iloc[:, 1:].to_numpy() fig = plt.figure() positions = [i for i in range(len(df.columns)-1)] - plt.xticks(positions, [colname[-3:] - for colname in df.columns.to_list()[1:]]) + plt.xticks(positions, df.columns.to_list()[1:], rotation='vertical') plt.yticks(positions, df.columns.to_list()[1:]) # set vmin = 1 so that only combinations that are simultaneously active at least on one day are in colour, # else white # set vmax = 300000, this should be larger than maxima in all dataframes, # this way colours of heatmaps are comparable (e.g. 
between codes or between joint_codes and exclusions) - plt.imshow(array_exclusion, cmap=cmap, norm=mpl.colors.LogNorm(vmin = 1, vmax=300000)) + plt.imshow(array_exclusion, cmap=cmap, + norm=mpl.colors.LogNorm(vmin=1, vmax=300000)) plt.colorbar() plt.savefig( os.path.join(target_directory, filename + '_{}'.format( @@ -1364,6 +1365,32 @@ def plot_counter(filename, directory): plt.close() +def plot_multiple_prescriptions(filename, directory): + target_directory = os.path.join(directory, 'heatmaps_mult_presc_' + filename) + if not os.path.exists(target_directory): + os.makedirs(target_directory) + + codelist = pd.ExcelFile(os.path.join( + directory, filename + '.xlsx'), engine='openpyxl').sheet_names + + cmap = copy.copy(mpl.cm.get_cmap('OrRd')) + + for code in codelist: + df = pd.read_excel( + os.path.join(directory, filename + '.xlsx'), + sheet_name=code, engine='openpyxl') + array_exclusion = df.iloc[:, 1:].to_numpy() + fig = plt.figure() + positions = [i for i in range(len(df.columns)-1)] + plt.yticks(positions, df.columns.to_list()[1:]) + plt.imshow(array_exclusion.diagonal(), cmap=cmap) + plt.title(code) + plt.savefig( + os.path.join(target_directory, filename + '_{}'.format( + code))) + plt.close() + + def main(): """! Main program entry.""" From c4608c7492ea93681c0820a60339d10e57abce6e Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 8 May 2023 14:51:40 +0200 Subject: [PATCH 080/104] merge count functions --- .../memilio/epidata/getNPIData.py | 59 ++++--------------- 1 file changed, 13 insertions(+), 46 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 989626459f..0908580145 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1114,10 +1114,10 @@ def get_npi_data(fine_resolution=2, nocombi_code] += len(days_deact) # count joint codes from after strictness based deactivation - count_codes_active(df_merged, df_count_active, countyID) + df_count_active = count_codes(df_merged, df_count_active, countyID) # count joint codes from after incidence based activation - count_codes_incid_depend( + df_count_incid_depend = count_codes( df_incid_depend, df_count_incid_depend, countyID) # for fine resolution = 1 only consider merged dataframe @@ -1266,58 +1266,25 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered): return df_count -def count_codes_active(df_merged, df_count_active, county): - df_local = df_merged[df_merged[dd.EngEng['idCounty']] == county] +def count_codes(df_old, df_count, county): + df_local = df_old[df_old[dd.EngEng['idCounty']] == county] code_dict = {} - for maincode in df_count_active.keys(): - for column in df_count_active[maincode][1].columns: + for maincode in df_count.keys(): + for column in df_count[maincode][1].columns: code_dict[column] = df_local.iloc[np.where( df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() - # with diag - # for maincode in df_count_active.keys(): - # column_list = df_count_active[maincode][1].columns - # for column in range(len(column_list)): - # for column_other in range(len(column_list)): - # df_count_active[maincode][1].iloc[column, column_other] += len(set( - # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - - # no diag - for maincode in df_count_active.keys(): - column_list = df_count_active[maincode][1].columns + # iterate over code/column indices 0 to code_idx-1 
(not filling diagonal) + # Note that the upper diagonal part of the matrix does not + # need to be considered as matrix is symmetric. + for maincode in df_count.keys(): + column_list = df_count[maincode][1].columns for column in range(len(column_list)): for column_other in range(column): - df_count_active[maincode][1].iloc[column, column_other] += len(set( + df_count[maincode][1].iloc[column, column_other] += len(set( code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - return df_count_active - - -def count_codes_incid_depend(df_incid_depend, df_count_incid_depend, county): - df_local = df_incid_depend[df_incid_depend[dd.EngEng['idCounty']] == county] - code_dict = {} - for maincode in df_count_incid_depend.keys(): - for column in df_count_incid_depend[maincode][1].columns: - code_dict[column] = df_local.iloc[np.where( - df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() - - # with diag - # for maincode in df_count_incid_depend.keys(): - # column_list = df_count_incid_depend[maincode][1].columns - # for column in range(len(column_list)): - # for column_other in range(len(column_list)): - # df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( - # code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - - # no diag - for maincode in df_count_incid_depend.keys(): - column_list = df_count_incid_depend[maincode][1].columns - for column in range(len(column_list)): - for column_other in range(column): - df_count_incid_depend[maincode][1].iloc[column, column_other] += len(set( - code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - - return df_count_incid_depend + return df_count def save_counter(df_count, filename, directory): From fe5df64a7a352861abb5c8918146ef36faecd090 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 8 May 2023 15:55:25 +0200 Subject: [PATCH 081/104] review suggestions --- .../memilio/epidata/getNPIData.py | 39 ++++++++++++------- pycode/memilio-epidata/setup.py | 1 + 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 0908580145..1704647a53 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -506,6 +506,11 @@ def get_npi_data(fine_resolution=2, df_npis_old, df_npis_desc, df_npis_combinations_pre = read_files( directory, fine_resolution) + print('Download completed.') + + npi_start_col = np.where( + df_npis_old.columns.str.contains('d2') == True)[0][0] + # get existing codes that are used # for fine resolution we don't have codes M22 - M24 but are still listed in description if fine_resolution > 0: @@ -556,14 +561,14 @@ def get_npi_data(fine_resolution=2, except KeyError: pass # rename essential columns and throw away others - columns_combinations = np.where((df_npis_combinations_pre == 'x').any() == True)[ - 0] # maybe rename columns_used ? 
+ columns_used = np.where( + (df_npis_combinations_pre == 'x').any() == True)[0] column_names = [ - 'Unnamed: ' + str(i) for i in range(columns_combinations[0], columns_combinations[-1]+1)] + 'Unnamed: ' + str(i) for i in range(columns_used[0], columns_used[-1]+1)] rename_columns = {column_names[i]: i for i in range(len(column_names))} df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) df_npis_combinations_pre = df_npis_combinations_pre[[ - 'Variablenname', 'Massnahmenindex'] + [i for i in range(0, len(columns_combinations))]] + 'Variablenname', 'Massnahmenindex'] + [i for i in range(0, len(columns_used))]] # replace empty cells by zeros and x-marked cells by ones df_npis_combinations_pre = df_npis_combinations_pre.replace(np.nan, 0) df_npis_combinations_pre = df_npis_combinations_pre.replace('x', 1) @@ -582,9 +587,9 @@ def get_npi_data(fine_resolution=2, [npi_groups_combinations == code].index)) # TODO: look at: - # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M16_100', 'M16_100_1', 'M16_100_2', 'M16_100_3', 'M16_100_4', 'M16_100_5']))].iloc[:,6:]==1) - # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01a_020', 'M01a_020_1', 'M01a_020_2', 'M01a_020_3', 'M01a_020_4', 'M01a_020_5']))].iloc[:,6:]==1) - # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01b_020', 'M01b_020_1', 'M01b_020_2', 'M01b_020_3', 'M01b_020_4', 'M01b_020_5']))].iloc[:,6:]==1) + # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M16_100', 'M16_100_1', 'M16_100_2', 'M16_100_3', 'M16_100_4', 'M16_100_5']))].iloc[:,npi_start_col:]==1) + # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01a_020', 'M01a_020_1', 'M01a_020_2', 'M01a_020_3', 'M01a_020_4', 'M01a_020_5']))].iloc[:,npi_start_col:]==1) + # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01b_020', 'M01b_020_1', 'M01b_020_2', 'M01b_020_3', 'M01b_020_4', 'M01b_020_5']))].iloc[:,npi_start_col:]==1) # create hash table of main code to strictness rankings inside main # code and combination matrix inside the same strictness rank @@ -954,13 +959,15 @@ def get_npi_data(fine_resolution=2, df_local_old = copy.deepcopy(df_npis_old[df_npis_old[dd.EngEng['idCounty']] == countyID]) + inc_codes = 6 # Is this always 6? + # Consistency of incidence dependent NPIs: # The same NPI should not be prescribed multiple times at the same day # for different thresholds. In order to avoid contradictions, only # retain the strictest mentioned implementation. - for i in range(int(len(df_local_old)/6)): + for i in range(int(len(df_local_old)/inc_codes)): sum_npi_inc = np.where( - df_local_old.iloc[6*i:6*(i+1), 6:].sum() > 1) + df_local_old.iloc[inc_codes*i:inc_codes*(i+1), npi_start_col:].sum() > 1) if len(sum_npi_inc[0]): print( 'Reduce multiple prescription in county ' + str(countyID) + @@ -968,15 +975,16 @@ def get_npi_data(fine_resolution=2, for j in sum_npi_inc[0]: # get lowest index (i.e., strictest implementation of NPI). idx_start = np.where( - df_local_old.iloc[6*i:6*(i+1), 6+j])[0].min() + df_local_old.iloc[inc_codes*i:inc_codes*(i+1), npi_start_col+j])[0].min() # Remove less strict and thus contradictory # implementations of the same NPI the same day. 
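In isolation, the correction continued right below keeps only the lowest-index, i.e. strictest, incidence variant of an NPI on a given day. A toy sketch with invented values:

    import numpy as np

    # one day column over the incidence variants of one NPI, strictest first
    day_col = np.array([0, 1, 0, 1, 0, 1])  # three variants set at once
    idx_start = np.where(day_col)[0].min()  # strictest variant mentioned
    day_col[idx_start + 1:] = 0             # drop weaker, contradictory ones
    # day_col is now [0, 1, 0, 0, 0, 0] and sums to 1, as the check requires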
- df_local_old.iloc[6*i+idx_start+1:6*(i+1), 6+j] = 0 + df_local_old.iloc[inc_codes*i+idx_start + + 1:inc_codes*(i+1), npi_start_col+j] = 0 if not all( df_local_old.iloc - [6 * i: 6 * (i + 1), - 6 + sum_npi_inc[0]].sum() == 1): + [inc_codes * i: inc_codes * (i + 1), + npi_start_col + sum_npi_inc[0]].sum() == 1): raise gd.DataError('Consistency correction failed.') ## end of consistency correction ## @@ -1249,7 +1257,7 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered): # count number of multiply mentionned NPIs with different incidence thresholds for the same day df_count[maincode][1].iloc[code_idx, code_idx] += df_local[npi_rows].iloc[:, - npi_date_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df) + npi_date_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df) # no diag for maincode in df_count.keys(): @@ -1333,7 +1341,8 @@ def plot_counter(filename, directory): def plot_multiple_prescriptions(filename, directory): - target_directory = os.path.join(directory, 'heatmaps_mult_presc_' + filename) + target_directory = os.path.join( + directory, 'heatmaps_mult_presc_' + filename) if not os.path.exists(target_directory): os.makedirs(target_directory) diff --git a/pycode/memilio-epidata/setup.py b/pycode/memilio-epidata/setup.py index e0e93562dc..51722cdcf2 100644 --- a/pycode/memilio-epidata/setup.py +++ b/pycode/memilio-epidata/setup.py @@ -74,6 +74,7 @@ def run(self): 'numpy>=1.21', 'openpyxl', 'xlrd', + 'xlsxwriter', 'requests', 'pyxlsb', 'wget' From 99f2d41a09ac25bec703ba07833c00b81bc9d1da Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 9 May 2023 12:09:35 +0200 Subject: [PATCH 082/104] fix some errors --- .../memilio/epidata/getNPIData.py | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 1704647a53..0305153e0a 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -368,14 +368,14 @@ def drop_codes_and_categories( @return Returns dropped codes, prior codes and reduced original data frame. """ if fine_resolution > 0: + + for i in range(1, 6): # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} # (M04_1, i.e. i=1, has been corrected in original file but not for i>1) - for i in range(2, 6): - npi_codes_prior[npi_codes_prior == 'M04_'+str(i)] = ['M04_120_'+str( - i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] - + if i != 1: + npi_codes_prior[npi_codes_prior == 'M04_'+str(i)] = ['M04_120_'+str( + i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] # correct M05_N codes to M_05_M_N, N in {1,...,5}, M in {130,150,120,140,110,100,160} - for i in range(1, 6): npi_codes_prior[npi_codes_prior == 'M05_'+str(i)] = ['M05_130_'+str(i), 'M05_150_'+str( i), 'M05_120_'+str(i), 'M05_140_'+str(i), 'M05_110_'+str(i), 'M05_100_'+str(i), 'M05_160_'+str(i)] @@ -959,7 +959,7 @@ def get_npi_data(fine_resolution=2, df_local_old = copy.deepcopy(df_npis_old[df_npis_old[dd.EngEng['idCounty']] == countyID]) - inc_codes = 6 # Is this always 6? 
+ inc_codes = len(np.where(df_npis.columns.str.contains('M01a_010'))[0]) # Consistency of incidence dependent NPIs: # The same NPI should not be prescribed multiple times at the same day @@ -971,7 +971,7 @@ def get_npi_data(fine_resolution=2, if len(sum_npi_inc[0]): print( 'Reduce multiple prescription in county ' + str(countyID) + - ' for NPI ' + str(npis.loc[i, 'Description'])) + ' for NPI ' + str(npis.loc[inc_codes*i, 'Description'])) for j in sum_npi_inc[0]: # get lowest index (i.e., strictest implementation of NPI). idx_start = np.where( @@ -1323,7 +1323,7 @@ def plot_counter(filename, directory): os.path.join(directory, filename + '.xlsx'), sheet_name=code, engine='openpyxl') array_exclusion = df.iloc[:, 1:].to_numpy() - fig = plt.figure() + fig = plt.figure(figsize=(10,12)) positions = [i for i in range(len(df.columns)-1)] plt.xticks(positions, df.columns.to_list()[1:], rotation='vertical') plt.yticks(positions, df.columns.to_list()[1:]) @@ -1359,7 +1359,10 @@ def plot_multiple_prescriptions(filename, directory): fig = plt.figure() positions = [i for i in range(len(df.columns)-1)] plt.yticks(positions, df.columns.to_list()[1:]) - plt.imshow(array_exclusion.diagonal(), cmap=cmap) + plt.xticks([]) + plt.imshow(np.array([array_exclusion.diagonal()]).T, + cmap=cmap, norm=mpl.colors.LogNorm(vmin=1, vmax=50000)) + plt.colorbar() plt.title(code) plt.savefig( os.path.join(target_directory, filename + '_{}'.format( From 9f3be268638f6281835c04e371712a7eb8f91964 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 22 May 2023 08:55:23 +0200 Subject: [PATCH 083/104] don't plot diag --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 0305153e0a..44d97cd05e 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1322,6 +1322,9 @@ def plot_counter(filename, directory): df = pd.read_excel( os.path.join(directory, filename + '.xlsx'), sheet_name=code, engine='openpyxl') + # set diag = 0 + for i in range(df.shape[0]): + df.iloc[i,i+1]=0 array_exclusion = df.iloc[:, 1:].to_numpy() fig = plt.figure(figsize=(10,12)) positions = [i for i in range(len(df.columns)-1)] @@ -1374,6 +1377,8 @@ def main(): """! 
Main program entry.""" # arg_dict = gd.cli("testing") + directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') + plot_counter('joint_codes', directory) df = get_npi_data(fine_resolution=2, file_format='csv') From 1c253290d13e41a9b1780bd5bbbca169ceca75c3 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 22 May 2023 10:15:29 +0200 Subject: [PATCH 084/104] count plot_counter calls in test --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 5 ++--- .../memilio/epidata_test/test_epidata_getNPIData.py | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 44d97cd05e..aaed45b24d 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -959,7 +959,8 @@ def get_npi_data(fine_resolution=2, df_local_old = copy.deepcopy(df_npis_old[df_npis_old[dd.EngEng['idCounty']] == countyID]) - inc_codes = len(np.where(df_npis.columns.str.contains('M01a_010'))[0]) + inc_codes = len(np.where(df_npis.columns.str.contains( + npis[dd.EngEng['npiCode']][0]))[0]) # Consistency of incidence dependent NPIs: # The same NPI should not be prescribed multiple times at the same day @@ -1377,8 +1378,6 @@ def main(): """! Main program entry.""" # arg_dict = gd.cli("testing") - directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') - plot_counter('joint_codes', directory) df = get_npi_data(fine_resolution=2, file_format='csv') diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index 5d3c4a0773..467e25dabb 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -278,7 +278,8 @@ def test_drop_codes_and_categories(self): df_npis_desc['Variablenname'], df_npis_old_renamed]) @patch('memilio.epidata.getNPIData.plot_counter') - def test_get_npi_data(self, mock_plot, mock_codes, mock_read, mock_data): + @patch('memilio.epidata.getNPIData.plot_multiple_prescriptions') + def test_get_npi_data(self, mock_plot_mult, mock_plot, mock_codes, mock_read, mock_data): # print 'Additional errors in consistent naming' is expected. # print 'WARNING: DataFrame starts with reported cases > 0 for more than 5 percent...' is expected. npis_test = gnd.get_npi_data( @@ -331,6 +332,8 @@ def test_get_npi_data(self, mock_plot, mock_codes, mock_read, mock_data): npis_test.M1_1_1.to_list(), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) + self.assertEqual(mock_plot.call_count, 4) + self.assertEqual(mock_plot_mult.call_count, 1) if __name__ == '__main__': unittest.main() From 8275e087a17b238f2e7859108e34b43fcd8a4642 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 13 Jun 2023 12:54:49 +0200 Subject: [PATCH 085/104] adjust plotting --- .../memilio/epidata/getNPIData.py | 66 +++++++++++++++++-- 1 file changed, 60 insertions(+), 6 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index aaed45b24d..cac35dd390 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -370,8 +370,8 @@ def drop_codes_and_categories( if fine_resolution > 0: for i in range(1, 6): - # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} - # (M04_1, i.e. 
i=1, has been corrected in original file but not for i>1) + # correct M04_N codes to M_04_M_N, N in {1,...,5}, M in {120,110,100,130,140} + # (M04_1, i.e. i=1, has been corrected in original file but not for i>1) if i != 1: npi_codes_prior[npi_codes_prior == 'M04_'+str(i)] = ['M04_120_'+str( i), 'M04_110_'+str(i), 'M04_100_'+str(i), 'M04_130_'+str(i), 'M04_140_'+str(i)] @@ -1325,22 +1325,48 @@ def plot_counter(filename, directory): sheet_name=code, engine='openpyxl') # set diag = 0 for i in range(df.shape[0]): - df.iloc[i,i+1]=0 + df.iloc[i, i+1] = 0 array_exclusion = df.iloc[:, 1:].to_numpy() - fig = plt.figure(figsize=(10,12)) + if filename != 'count_deactivation': + # for count deactivation xlabel != ylabel + # else matrix is of squared form + array_exclusion += np.transpose(array_exclusion) positions = [i for i in range(len(df.columns)-1)] plt.xticks(positions, df.columns.to_list()[1:], rotation='vertical') plt.yticks(positions, df.columns.to_list()[1:]) + + # use different labels and title for each filename + if filename == 'count_deactivation': + plt.xlabel('Second NPI') + plt.ylabel('First NPI') + plt.title('NPI deactivations') + elif filename == 'joint_codes_incid_depend': + plt.xlabel('NPI') + plt.ylabel('NPI') + plt.title('Joint NPI prescriptions (incidence dependent)') + elif filename == 'joint_codes_active': + plt.xlabel('NPI') + plt.ylabel('NPI') + plt.title('Joint NPI implementations') + elif filename == 'joint_codes': + plt.xlabel('NPI') + plt.ylabel('NPI') + plt.title('Joint NPI prescriptions') + else: + raise gd.DataError('unknown filename: '+filename) + # set vmin = 1 so that only combinations that are simultaneously active at least on one day are in colour, # else white # set vmax = 300000, this should be larger than maxima in all dataframes, # this way colours of heatmaps are comparable (e.g. between codes or between joint_codes and exclusions) + plt.imshow(array_exclusion, cmap=cmap, norm=mpl.colors.LogNorm(vmin=1, vmax=300000)) plt.colorbar() + plt.tight_layout() plt.savefig( os.path.join(target_directory, filename + '_{}'.format( - code))) + code)), dpi=300) plt.close() @@ -1368,12 +1394,40 @@ def plot_multiple_prescriptions(filename, directory): cmap=cmap, norm=mpl.colors.LogNorm(vmin=1, vmax=50000)) plt.colorbar() plt.title(code) + plt.tight_layout() plt.savefig( os.path.join(target_directory, filename + '_{}'.format( - code))) + code)), dpi=300) plt.close() +def plot_activation(df_npis_old, df_final, directory, maincode, subcodes): + counties = df_final.ID_County.unique() + for scode in subcodes: + name = maincode+scode + mentions = df_npis_old[df_npis_old.NPI_code == + name].iloc[:, 6:].sum(axis=0) + for inc_idx in ['_1', '_2', '_3', '_4', '_5']: + mentions += df_npis_old[df_npis_old.NPI_code == + name+inc_idx].iloc[:, 6:].sum(axis=0) + tick_range = (np.arange(int(len(mentions) / 100) + 1) * 100) + plt.xticks(tick_range) + plt.plot(mentions.index, mentions, c='firebrick', + label=name + ' mentioned') + implementations = mentions[54:].copy() + implementations *= 0 + for county in counties: + implementations += df_final[df_final.ID_County == + county].filter(regex=name).sum(axis=1).values + plt.plot(implementations.index, implementations, + c='g', label=name + ' active') + plt.grid(True) + plt.legend(loc='best') + plt.tight_layout + plt.savefig(directory+'npi_plots/'+name) + plt.clf() + + def main(): """! 
Main program entry."""
    # arg_dict = gd.cli("testing")

    df = get_npi_data(fine_resolution=2, file_format='csv')

From cfeab1bacc416466e802c2f8b32c192d0599312f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Tue, 8 Aug 2023 13:28:48 +0200
Subject: [PATCH 086/104] correct and improve comment example

---
 .../memilio/epidata/getNPIData.py | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index 28d6763ac9..ec7c1a0b64 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -274,11 +274,15 @@ def activate_npis_based_on_incidence(
     If one of the former cases holds true, then the activation or lifting happens
     two days after the satisfaction of the criterion. This is in accordance with
-    case reporting that can only happen after the day has finished and as these
-    reports generally appeared in the morning for the previous day, the NPI can
-    not directly be activated or lifted that day but only on the next day. Hence
+    case reporting that can only happen after the day has finished (a 'delay' of one
+    day is introduced here) and as these reports generally appeared in the morning
+    for the previous day, the NPI could not directly be activated or lifted that same
+    day but only on the next day (another delay of one day). Hence,
     the incidence-dependent NPI is activated or lifted two days after the threshold
-    is/ is not exceeded. Please see the examples for a better understanding.
+    is/is no longer exceeded, additionally considering the number of consecutive
+    days to implement or lift (see second paragraph above).
+
+    Please see the examples for a better understanding.

     Example (Threshold=3.5):
     local_incid=pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
@@ -292,19 +296,20 @@
     With yesterday's incidence over threshold on days:
     [0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
-    With npi_lifting_days_threshold=2, npi_activation_days_threshold=1
-    NPI should be activated on days 4 and 9 and lifted on days 8 and 14
-    int_active should then be:
+    With npi_lifting_days_threshold=2, npi_activation_days_threshold=1
+    NPI should be activated on days 4 and 9 and lifted on days 8 and 14, i.e.,
+    int_active then is:
     [0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0]
-    With npi_lifting_days_threshold=3, npi_activation_days_threshold=2
-    NPI should be activated on day 9 (and lifted on day 15; not in the vector)
-    [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
+    With npi_lifting_days_threshold=3, npi_activation_days_threshold=2
+    NPI will be activated on day 10 (and would be lifted on day 15;
+    which is not in the vector anymore), i.e., int_active then is:
+    [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

     Another example:
     With yesterday's incidence over threshold on days:
     [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]
-    npi_lifting_days_threshold=3, npi_activation_days_threshold=1
-    NPI should be activated on day 2 and lifted on day 14
-    int_active should then be:
+    and npi_lifting_days_threshold=3, npi_activation_days_threshold=1,
+    the NPI will be activated on day 2 and lifted on day 14
+    int_active then be is:
     [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]

     Please also note that the first column will always be returned as false
@@ -334,7 +339,8 @@
     # loop over every day
     for i in range(len(yesterdays_incid_over_threshold)):
         # Set int_active=0 where last npi_lifting_days_threshold+1 days did not exceed
-        # the threshold
+        # the threshold. Look only until the day before yesterday (the sum over max(...):i excludes day i)
+        # as we assume a necessary delay of 24h to implement an intervention (see explanation above)
         if yesterdays_incid_over_threshold[max(0, i-npi_lifting_days_threshold):i].values.sum() == 0:
             int_active[i] = 0
         # Set int_active=1 where last npi_activation_days_threshold+1 days did

From 4eccc02d1058890000378790f36198f921db91f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Tue, 8 Aug 2023 13:51:37 +0200
Subject: [PATCH 087/104] comments improved

---
 .../memilio/epidata/getNPIData.py | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index ec7c1a0b64..d403b9113b 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -284,7 +284,7 @@ def activate_npis_based_on_incidence(

     Please see the examples for a better understanding.

-    Example (Threshold=3.5):
+    Example 1 (Threshold=3.5):
     local_incid=pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
     Yesterday's incidence is over the threshold on the following days:
     [?, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
@@ -295,21 +295,21 @@
     With yesterday's incidence over threshold on days:
     [0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]
-    With npi_lifting_days_threshold=2, npi_activation_days_threshold=1
-    NPI should be activated on days 4 and 9 and lifted on days 8 and 14, i.e.,
+    Example 1a) ... and npi_lifting_days_threshold=2, npi_activation_days_threshold=1,
+    the NPI will be activated on days 4 and 9 and lifted on days 8 and 14, i.e.,
     int_active then is:
     [0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0]
-    With npi_lifting_days_threshold=3, npi_activation_days_threshold=2
-    NPI will be activated on day 10 (and would be lifted on day 15;
+    Example 1b) ... and npi_lifting_days_threshold=3, npi_activation_days_threshold=2,
+    the NPI will be activated on day 10 (and would be lifted on day 15;
     which is not in the vector anymore), i.e., int_active then is:
     [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

-    Another example:
+    Example 2:
     With yesterday's incidence over threshold on days:
     [1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]
-    and npi_lifting_days_threshold=3, npi_activation_days_threshold=1,
-    the NPI will be activated on day 2 and lifted on day 14
-    int_active then be is:
+    and npi_lifting_days_threshold=3, npi_activation_days_threshold=1,
+    the NPI will be activated on day 2 and lifted on day 14, i.e.,
+    int_active then is:
     [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]

     Please also note that the first column will always be returned as false
@@ -514,11 +514,13 @@ def get_npi_data(fine_resolution=2,

     print('Download completed.')

+    # Compute column index of NPI start (columns with NPIs start with days
+    # which are provided in format dYYYYMMDD).
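The activation and lifting rule documented above can be checked in isolation. The following sketch is a reimplementation of the described semantics with local names (it is not the module's function itself) and reproduces Example 1a:

    import numpy as np
    import pandas as pd

    local_incid = pd.Series([2, 4, 2, 4, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2])
    lift_days, activ_days = 2, 1  # Example 1a
    # a day can only react to yesterday's report (one day of reporting delay)
    yesterday_over = (local_incid.shift(1) > 3.5).astype(int)

    int_active = np.zeros(len(local_incid), dtype=int)
    for i in range(1, len(local_incid)):
        if yesterday_over.iloc[max(0, i - lift_days):i].sum() == 0:
            int_active[i] = 0  # lift: no exceedance in the lifting window
        elif yesterday_over.iloc[max(0, i - activ_days):i].sum() == activ_days:
            int_active[i] = 1  # activate: threshold exceeded on enough days
        else:
            int_active[i] = int_active[i - 1]  # otherwise keep current state

    assert list(int_active) == [0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0]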
npi_start_col = np.where( df_npis_old.columns.str.contains('d2') == True)[0][0] - # get existing codes that are used - # for fine resolution we don't have codes M22 - M24 but are still listed in description + # get existing codes that are used; for fine resolution we don't + # have codes M22 - M24 but these are still listed in description if fine_resolution > 0: # count how many codes contain M22, M23 or M24 num_nonexistent_codes = df_npis_desc['Variablenname'].str.count( From cb4a6112e1ef23ee1b4e31f02960e4dd696c8fe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Wed, 9 Aug 2023 17:49:36 +0200 Subject: [PATCH 088/104] real removal of column zero and adapting npi combi stuff --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index d403b9113b..a374e5d1af 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -565,14 +565,13 @@ def get_npi_data(fine_resolution=2, # drop 0 column if existent try: - df_npis_combinations_pre.drop(columns=0) + df_npis_combinations_pre.drop(columns='Unnamed: 0', inplace=True) except KeyError: pass # rename essential columns and throw away others columns_used = np.where( (df_npis_combinations_pre == 'x').any() == True)[0] - column_names = [ - 'Unnamed: ' + str(i) for i in range(columns_used[0], columns_used[-1]+1)] + column_names = list(df_npis_combinations_pre.columns[[i for i in columns_used]]) rename_columns = {column_names[i]: i for i in range(len(column_names))} df_npis_combinations_pre.rename(columns=rename_columns, inplace=True) df_npis_combinations_pre = df_npis_combinations_pre[[ From 883f67daca14bcf83dda119b7c225e5ce64d52b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Thu, 10 Aug 2023 17:00:41 +0200 Subject: [PATCH 089/104] minimal changes --- .../memilio/epidata/getNPIData.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index a374e5d1af..2f655b28f8 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -565,7 +565,7 @@ def get_npi_data(fine_resolution=2, # drop 0 column if existent try: - df_npis_combinations_pre.drop(columns='Unnamed: 0', inplace=True) + df_npis_combinations_pre.drop(columns=0, inplace=True) except KeyError: pass # rename essential columns and throw away others @@ -593,11 +593,6 @@ def get_npi_data(fine_resolution=2, npi_groups_combinations [npi_groups_combinations == code].index)) - # TODO: look at: - # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M16_100', 'M16_100_1', 'M16_100_2', 'M16_100_3', 'M16_100_4', 'M16_100_5']))].iloc[:,npi_start_col:]==1) - # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01a_020', 'M01a_020_1', 'M01a_020_2', 'M01a_020_3', 'M01a_020_4', 'M01a_020_5']))].iloc[:,npi_start_col:]==1) - # np.where(df_npis_old[df_npis_old.NPI_code.isin(pd.Series(['M01b_020', 'M01b_020_1', 'M01b_020_2', 'M01b_020_3', 'M01b_020_4', 'M01b_020_5']))].iloc[:,npi_start_col:]==1) - # create hash table of main code to strictness rankings inside main # code and combination matrix inside the same strictness rank df_npis_combinations = { @@ -1234,19 +1229,19 @@ def get_npi_data(fine_resolution=2, 
 def count_code_multiplicities_init(df_npis_old, df_count, counties_considered):
-    """! Count multiply for all pairs of NPI codes how many times they were
+    """! Count for all pairs of NPI codes how many times they were
     mentioned at the same day in the initial data frame.
 
-    @param df_npis_old Initial data frame read from Corona Datenplattform.
-    @param df_count Dictionnary of main NPI codes with empty interaction
+    @param[in] df_npis_old Initial data frame read from Corona Datenplattform.
+    @param[in,out] df_count Dictionary of main NPI codes with empty interaction
         matrix (to be filled) for all codes under main code in df_count[maincode][1].
-    @param counties_considered County IDs for which initial data frame is
+    @param[in] counties_considered County IDs for which initial data frame is
         considered.
     """
     for county in counties_considered:
         df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county]
         # get column where dates start
-        npi_date_start_col = np.where(
+        npi_start_col = np.where(
             df_local.columns.str.startswith('d2') == True)[0][0]
         # prepare dictionary for dates when code was mentioned
         code_dates = {}
         # run through all maincodes (i.e., first 3-4 characters like M01a or M11)
         for maincode in df_count.keys():
             code_list = df_count[maincode][1].columns
             # iterate over code/row indices 0 to n
             for code_idx in range(len(code_list)):
                 # get dates where NPI is mentioned as existing in potential intervention set
                 npi_rows = df_local.NPI_code.str.contains(code_list[code_idx])
                 npi_dates_in_df = np.where(
-                    df_local[npi_rows].iloc[:, npi_date_start_col:].max() > 0)[0]
+                    df_local[npi_rows].iloc[:, npi_start_col:].max() > 0)[0]
                 # store non-transformed dates in code_dict
                 code_dates[code_list[code_idx]] = df_local.iloc[:,
-                    npi_date_start_col + npi_dates_in_df].columns
+                    npi_start_col + npi_dates_in_df].columns
 
                 # count number of multiply mentioned NPIs with different incidence thresholds for the same day
                 df_count[maincode][1].iloc[code_idx, code_idx] += df_local[npi_rows].iloc[:,
-                    npi_date_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df)
+                    npi_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df)
 
     # no diag
     for maincode in df_count.keys():

From eeb9288c9b1056400165b75bcc1dde7458b24ead Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?=
Date: Mon, 14 Aug 2023 17:43:10 +0200
Subject: [PATCH 090/104] comments

---
 pycode/memilio-epidata/memilio/epidata/getNPIData.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index 2f655b28f8..000b98a76c 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -1246,6 +1246,7 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered):
         # prepare dictionary for dates when code was mentioned
         code_dates = {}
         # run through all maincodes (i.e., first 3-4 characters like M01a or M11)
+        # diagonal entries
         for maincode in df_count.keys():
             code_list = df_count[maincode][1].columns
             # iterate over code/row indices 0 to n
@@ -1262,7 +1263,7 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered):
             df_count[maincode][1].iloc[code_idx, code_idx] += df_local[npi_rows].iloc[:,
                 npi_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df)
 
-    # no diag
+    # offdiagonal entries (as before, use that code_dates has been filled for all diagonal entries, i.e., all codes)
     for maincode in df_count.keys():
         code_list = df_count[maincode][1].columns
         # iterate over rows in matrix df_count with code/row indices 0 to n
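The diagonal count above is easiest to see on a toy example. The following sketch is illustrative only (made-up codes, dates and a layout mimicking df_npis_old); it reproduces the "total mentions minus one allowed mention per day" logic of the diagonal entries:

import numpy as np
import pandas as pd

# one row per NPI code variant, date columns dYYYYMMDD with 0/1 mentions
df_local = pd.DataFrame({
    'NPI_code': ['M01a_010', 'M01a_010_1', 'M01a_010_2'],
    'd20200301': [1, 1, 0],   # two variants mentioned on the same day
    'd20200302': [0, 1, 0],   # a single mention
    'd20200303': [1, 1, 1]})  # three variants mentioned

npi_rows = df_local.NPI_code.str.contains('M01a_010')
npi_dates_in_df = np.where(df_local[npi_rows].iloc[:, 1:].max() > 0)[0]
# surplus mentions: total mentions minus one allowed mention per day
surplus = df_local[npi_rows].iloc[:, 1 + npi_dates_in_df].sum().sum() \
    - len(npi_dates_in_df)
print(surplus)  # 3 = (2 - 1) + (1 - 1) + (3 - 1)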
From 8740035abb3d50ec9f11f091aa75d644c1e51128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Tue, 15 Aug 2023 14:49:09 +0200 Subject: [PATCH 091/104] plotting and saving --- .../memilio/epidata/getNPIData.py | 74 ++++++++++++------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 000b98a76c..6c02f2e2f1 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -892,8 +892,8 @@ def get_npi_data(fine_resolution=2, df_count_joint_codes[maincode][1] *= 0 df_counted_joint_codes = count_code_multiplicities_init(df_npis_old, df_count_joint_codes, counties_considered=counties_considered) - save_counter(df_counted_joint_codes, 'joint_codes', directory) - plot_counter('joint_codes', directory) + save_interaction_matrix(df_counted_joint_codes, 'joint_codes', directory) + plot_interaction_matrix('joint_codes', directory) plot_multiple_prescriptions('joint_codes', directory) # create dataframe to count multiple codes after incidence dependent (de-)activation @@ -1165,8 +1165,8 @@ def get_npi_data(fine_resolution=2, '. Estimated time remaining: ' + str(int(time_remain / 60)) + ' min.') - save_counter(df_count_deactivation, 'count_deactivation', directory) - plot_counter('count_deactivation', directory) + save_interaction_matrix(df_count_deactivation, 'count_deactivation', directory) + plot_interaction_matrix('count_deactivation', directory) if counter_cases_start >= len(counties_considered)*0.05: print('WARNING: DataFrame starts with reported cases > 0 ' @@ -1176,11 +1176,11 @@ def get_npi_data(fine_resolution=2, 'Please consider a start date of some weeks ahead of the ' 'time window to be analyzed for NPI\'s effects.') - save_counter(df_count_incid_depend, 'joint_codes_incid_depend', directory) - plot_counter('joint_codes_incid_depend', directory) + save_interaction_matrix(df_count_incid_depend, 'joint_codes_incid_depend', directory) + plot_interaction_matrix('joint_codes_incid_depend', directory) - save_counter(df_count_active, 'joint_codes_active', directory) - plot_counter('joint_codes_active', directory) + save_interaction_matrix(df_count_active, 'joint_codes_active', directory) + plot_interaction_matrix('joint_codes_active', directory) # print sub counters print('Sub task counters are: ') @@ -1234,7 +1234,7 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered): @param[in] df_npis_old Initial data frame read from Corona Datenplattform. @param[in,out] df_count Dictionnary of main NPI codes with empty interaction - matrix (to be filled) for all codes under main code in df_count[maincode][1] + matrix (to be filled) for all codes under main code in df_count[maincode][1]. @param[in] counties_considered County IDs for which initial data frame is considered. """ @@ -1299,40 +1299,62 @@ def count_codes(df_old, df_count, county): return df_count -def save_counter(df_count, filename, directory): - # save results +def save_interaction_matrix(df_interactions, filename, directory): + """! Saves interaction matrices for all subcodes in provided main codes. + + @param[in] df_interactions Dictionnary of main NPI codes with interaction + matrix for all subcodes under main code in df_interactions[maincode][1]. + @param[in] filename Filename to store result. + @param[in] directory Directory where to save data. 
+ """ writer = pd.ExcelWriter( os.path.join(directory, filename + '.xlsx'), engine='xlsxwriter') - for code in df_count.keys(): - df_count[code][1].to_excel(writer, sheet_name=code) + for code in df_interactions.keys(): + df_interactions[code][1].to_excel(writer, sheet_name=code) writer.close() # saves plot in folder directory/heatmaps_filename -def plot_counter(filename, directory): +def plot_interaction_matrix(filename, directory): + """! Reads interaction matrices from hard drive and writes heatmap plots + to hard drive. + + @param[in] filename Filename to read results from. + @param[in] directory Directory where to read and save data. + """ target_directory = os.path.join(directory, 'heatmaps_' + filename) if not os.path.exists(target_directory): os.makedirs(target_directory) - codelist = pd.ExcelFile(os.path.join( - directory, filename + '.xlsx'), engine='openpyxl').sheet_names + try: + codelist = pd.ExcelFile(os.path.join( + directory, filename + '.xlsx'), engine='openpyxl').sheet_names + except: + raise FileNotFoundError('File ' + filename + ' not found.') - cmap = copy.copy(mpl.cm.get_cmap('OrRd')) + # invert color map elements for tab20c such that subcolors are shown + # from light to dark + cmap = copy.copy(mpl.cm.get_cmap('tab20b')) + colors = [cmap(i) for i in np.array([list(range(4*(i+1)-1,4*i-1,-1)) for i in range(5)]).flatten()] + colors = colors + [(0.6, 0.6, 0.6), (0.4, 0.4, 0.4), + (0.2, 0.2, 0.2), (0, 0, 0)] + cmap = mpl.colors.ListedColormap(colors) for code in codelist: df = pd.read_excel( os.path.join(directory, filename + '.xlsx'), sheet_name=code, engine='openpyxl') - # set diag = 0 + # set diag = 0, access (i,i+1) as first column contains index for i in range(df.shape[0]): df.iloc[i, i+1] = 0 + # remove first column and convert to numpy array array_exclusion = df.iloc[:, 1:].to_numpy() if filename != 'count_deactivation': # for count deactivation xlabel != ylabel - # else matrix is of squared form + # else matrix is of squared form and symmetric array_exclusion += np.transpose(array_exclusion) positions = [i for i in range(len(df.columns)-1)] plt.xticks(positions, df.columns.to_list()[1:], rotation='vertical') @@ -1356,15 +1378,17 @@ def plot_counter(filename, directory): plt.ylabel('NPI') plt.title('Joint NPI prescriptions') else: - raise gd.DataError('unknown filename: '+filename) + raise gd.DataError('Unknown filename: ' + filename) - # set vmin = 1 so that only combinations that are simultaneously active at least on one day are in colour, - # else white - # set vmax = 300000, this should be larger than maxima in all dataframes, - # this way colours of heatmaps are comparable (e.g. between codes or between joint_codes and exclusions) + # Set vmin = 1 so that only combinations that are simultaneously active + # at least on one day are in color, else use white. + # Set vmax = 1e6 to be adjusted with colormap, this value is larger + # than the maximum in all dataframes, this way colors of heatmaps are + # comparable across different visualizations + # (e.g. 
between codes or between joint_codes and exclusions) plt.imshow(array_exclusion, cmap=cmap, - norm=mpl.colors.LogNorm(vmin=1, vmax=300000)) + norm=mpl.colors.LogNorm(vmin=1, vmax=1e6)) plt.colorbar() plt.tight_layout() plt.savefig( From 1e94e1c5a9564efd7011f56b4fad8b6359edb03b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Wed, 16 Aug 2023 09:26:15 +0200 Subject: [PATCH 092/104] combine functions --- .../memilio/epidata/getNPIData.py | 55 +++++++++---------- 1 file changed, 25 insertions(+), 30 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 6c02f2e2f1..429cd01947 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -894,7 +894,6 @@ def get_npi_data(fine_resolution=2, counties_considered=counties_considered) save_interaction_matrix(df_counted_joint_codes, 'joint_codes', directory) plot_interaction_matrix('joint_codes', directory) - plot_multiple_prescriptions('joint_codes', directory) # create dataframe to count multiple codes after incidence dependent (de-)activation df_incid_depend = pd.DataFrame() @@ -1320,7 +1319,10 @@ def save_interaction_matrix(df_interactions, filename, directory): def plot_interaction_matrix(filename, directory): """! Reads interaction matrices from hard drive and writes heatmap plots - to hard drive. + to hard drive. Separates diagonal and offdiagonal entries as + interactions inside one NPI are counted for all incidence dependent + sublevels while between NPIs only one interaction is counted if more + than one sublevel is mentioned on each of the sides. @param[in] filename Filename to read results from. @param[in] directory Directory where to read and save data. @@ -1338,7 +1340,11 @@ def plot_interaction_matrix(filename, directory): # invert color map elements for tab20c such that subcolors are shown # from light to dark cmap = copy.copy(mpl.cm.get_cmap('tab20b')) - colors = [cmap(i) for i in np.array([list(range(4*(i+1)-1,4*i-1,-1)) for i in range(5)]).flatten()] + colors = [ + cmap(i) + for i in np.array( + [list(range(4 * (i + 1) - 1, 4 * i - 1, -1)) for i in + range(5)]).flatten()] colors = colors + [(0.6, 0.6, 0.6), (0.4, 0.4, 0.4), (0.2, 0.2, 0.2), (0, 0, 0)] cmap = mpl.colors.ListedColormap(colors) @@ -1347,11 +1353,16 @@ def plot_interaction_matrix(filename, directory): df = pd.read_excel( os.path.join(directory, filename + '.xlsx'), sheet_name=code, engine='openpyxl') - # set diag = 0, access (i,i+1) as first column contains index - for i in range(df.shape[0]): - df.iloc[i, i+1] = 0 + # remove first column and convert to numpy array array_exclusion = df.iloc[:, 1:].to_numpy() + + # separate diag and offdiag + array_exclusion_diag = copy.deepcopy(array_exclusion.diagonal()) + # set diag = 0 + for i in range(array_exclusion.shape[0]): + array_exclusion[i, i] = 0 + if filename != 'count_deactivation': # for count deactivation xlabel != ylabel # else matrix is of squared form and symmetric @@ -1380,50 +1391,34 @@ def plot_interaction_matrix(filename, directory): else: raise gd.DataError('Unknown filename: ' + filename) + ## plot offdiagonal (interactions between NPIs) # Set vmin = 1 so that only combinations that are simultaneously active # at least on one day are in color, else use white. # Set vmax = 1e6 to be adjusted with colormap, this value is larger # than the maximum in all dataframes, this way colors of heatmaps are # comparable across different visualizations # (e.g. 
between codes or between joint_codes and exclusions) - plt.imshow(array_exclusion, cmap=cmap, norm=mpl.colors.LogNorm(vmin=1, vmax=1e6)) plt.colorbar() plt.tight_layout() plt.savefig( - os.path.join(target_directory, filename + '_{}'.format( + os.path.join(target_directory, 'InterNPIs_' + filename + '_{}'.format( code)), dpi=300) plt.close() - -def plot_multiple_prescriptions(filename, directory): - target_directory = os.path.join( - directory, 'heatmaps_mult_presc_' + filename) - if not os.path.exists(target_directory): - os.makedirs(target_directory) - - codelist = pd.ExcelFile(os.path.join( - directory, filename + '.xlsx'), engine='openpyxl').sheet_names - - cmap = copy.copy(mpl.cm.get_cmap('OrRd')) - - for code in codelist: - df = pd.read_excel( - os.path.join(directory, filename + '.xlsx'), - sheet_name=code, engine='openpyxl') - array_exclusion = df.iloc[:, 1:].to_numpy() - fig = plt.figure() + ## plot diagonal (interactions between incidence levels of one NPIs) + plt.figure() positions = [i for i in range(len(df.columns)-1)] plt.yticks(positions, df.columns.to_list()[1:]) plt.xticks([]) - plt.imshow(np.array([array_exclusion.diagonal()]).T, - cmap=cmap, norm=mpl.colors.LogNorm(vmin=1, vmax=50000)) + plt.imshow(np.array([array_exclusion_diag]).T, + cmap=cmap, norm=mpl.colors.LogNorm(vmin=1, vmax=1e6)) plt.colorbar() - plt.title(code) + plt.title('Intra-NPI duplicates') plt.tight_layout() plt.savefig( - os.path.join(target_directory, filename + '_{}'.format( + os.path.join(target_directory, 'IntraNPIs_' + filename + '_{}'.format( code)), dpi=300) plt.close() From 08cf07350cc5330705d312c705f3f94dbd3a10c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 18 Aug 2023 14:55:42 +0200 Subject: [PATCH 093/104] more comments and checks --- .../memilio/epidata/getNPIData.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 429cd01947..f6d23cc927 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -915,9 +915,10 @@ def get_npi_data(fine_resolution=2, all_subcodes = [] for maincode in df_npis_combinations.keys(): all_subcodes += df_npis_combinations[maincode][1].columns.to_list() + # check (and validate) that element 0 and 1 in df_npis_combination match. if df_npis_combinations[maincode][1].columns.to_list() != list( df_npis_combinations[maincode][0].keys()): - raise gd.DataError('Error') + raise gd.DataError('Error. Description and table do not match.') for countyID in counties_considered: cid = 0 @@ -960,6 +961,8 @@ def get_npi_data(fine_resolution=2, df_local_old = copy.deepcopy(df_npis_old[df_npis_old[dd.EngEng['idCounty']] == countyID]) + # get number of codes of one NPI (incidence indep. + dep.) + # for fine_resolution=1, inc_codes=1, for fine_res=2, inc_codes=6 inc_codes = len(np.where(df_npis.columns.str.contains( npis[dd.EngEng['npiCode']][0]))[0]) @@ -967,10 +970,20 @@ def get_npi_data(fine_resolution=2, # The same NPI should not be prescribed multiple times at the same day # for different thresholds. In order to avoid contradictions, only # retain the strictest mentioned implementation. 
+ print_details = True # define if details are printed (probably to be deactivated) for i in range(int(len(df_local_old)/inc_codes)): + + # check if access is correct + if not all( + [npis[dd.EngEng['npiCode']][i * inc_codes] in npi_code_test + for npi_code_test in df_local_old.iloc + [inc_codes * i: inc_codes * (i + 1), + npi_start_col - 1].to_list()]): + raise gd.DataError('Wrong NPI rows aggregated.') + sum_npi_inc = np.where( df_local_old.iloc[inc_codes*i:inc_codes*(i+1), npi_start_col:].sum() > 1) - if len(sum_npi_inc[0]): + if len(sum_npi_inc[0]) and print_details: print( 'Reduce multiple prescription in county ' + str(countyID) + ' for NPI ' + str(npis.loc[inc_codes*i, 'Description'])) From e4242ed469934d3edd0489616345cbc73e32b302 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Tue, 29 Aug 2023 10:36:18 +0200 Subject: [PATCH 094/104] comments and slight design adjustments --- .../memilio/epidata/getNPIData.py | 36 +++++++++++++------ 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index f6d23cc927..5502ee7fac 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -924,6 +924,7 @@ def get_npi_data(fine_resolution=2, cid = 0 countyidx += 1 + ## compute incidence for given county and store in other data frame if fine_resolution > 0: # compute incidence based on previous data frames df_infec_local = copy.deepcopy( @@ -950,7 +951,6 @@ def get_npi_data(fine_resolution=2, df_infec_local = df_infec_local[(df_infec_local[dd.EngEng['date']] >= start_date_new) & ( df_infec_local[dd.EngEng['date']] <= end_date_new)].reset_index() - local_incid = copy.deepcopy(df_infec_local['Incidence']) # Count counties with start cases >= 1: # In this case NPI activation cannot be ensured to work as expected if cases_first_value >= 1: @@ -966,10 +966,12 @@ def get_npi_data(fine_resolution=2, inc_codes = len(np.where(df_npis.columns.str.contains( npis[dd.EngEng['npiCode']][0]))[0]) - # Consistency of incidence dependent NPIs: + # Consistency of incidence independent and dependent NPIs: # The same NPI should not be prescribed multiple times at the same day - # for different thresholds. In order to avoid contradictions, only - # retain the strictest mentioned implementation. + # for different incidence-dependent thresholds or incidence-independently. + # In order to avoid contradictions, only retain the strictest mentioned + # implementation. Incidence-independent is always stricter than any + # incidence-dependent implementation. 
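+    # Hypothetical illustration: if on some day both M01a_010 (incidence-
+    # independent) and one of its incidence-dependent variants, e.g.,
+    # M01a_010_3, are reported with value 1, only the incidence-independent
+    # code is retained for that day and the weaker variant is set to 0.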
print_details = True # define if details are printed (probably to be deactivated) for i in range(int(len(df_local_old)/inc_codes)): @@ -983,7 +985,7 @@ def get_npi_data(fine_resolution=2, sum_npi_inc = np.where( df_local_old.iloc[inc_codes*i:inc_codes*(i+1), npi_start_col:].sum() > 1) - if len(sum_npi_inc[0]) and print_details: + if (len(sum_npi_inc[0]) > 0) and print_details: print( 'Reduce multiple prescription in county ' + str(countyID) + ' for NPI ' + str(npis.loc[inc_codes*i, 'Description'])) @@ -1058,6 +1060,9 @@ def get_npi_data(fine_resolution=2, npis_idx_start = list( df_local_new.columns).index( npis[dd.EngEng['npiCode']][0]) + + # extract local incidence from local frame + local_incid = copy.deepcopy(df_infec_local['Incidence']) # iterate through all NPIs and activate if incidence threshold # is exceeded @@ -1080,6 +1085,8 @@ def get_npi_data(fine_resolution=2, # merge incidence dependent NPIs to have only one column for each subcode df_merged = df_local_new.iloc[:, :2].copy() for subcode in all_subcodes: + # extract columns which have the subcode as part of the column + # name and sum over all these subcodes df_merged[subcode] = df_local_new.filter( regex=subcode).sum(axis=1) # strictness deactivation is done with this merged dataframe @@ -1087,17 +1094,20 @@ def get_npi_data(fine_resolution=2, df_incid_depend = pd.concat( [df_incid_depend, copy.deepcopy(df_merged)]) - if df_merged.max()[2:].max() > 1: + if df_merged.iloc[:,2:].max().max() > 1: raise gd.DataError('Error in merging...') - # Remove conflicting NPIs according to strictness index of Corona-Datenplattform and exclusion criteria - # defined in df_npis_combinations + ## Remove conflicting NPIs according to strictness index of Corona- + ## Datenplattform and exclusion criteria defined in df_npis_combinations for maincode in df_npis_combinations.keys(): # get all subcodes subcodes = list(df_npis_combinations[maincode][0].keys()) + subcodes_strictness_values = list(df_npis_combinations[maincode][0].values()) + if len(subcodes) != len(subcodes_strictness_values): + raise gd.DataError('Subcode and strictness array inconsistent.') # sort index reversely with the strictest (highest) index first idx_strictness_sorted_rev = np.argsort( - list(df_npis_combinations[maincode][0].values()))[::-1] + subcodes_strictness_values)[::-1] for i in range(len(idx_strictness_sorted_rev)-1): # get index of NPI of a certain strictness idx_strictness = idx_strictness_sorted_rev[i] @@ -1116,14 +1126,15 @@ def get_npi_data(fine_resolution=2, # only consider those codes which cannot be combined; # for these, values of 1 have to be set to 0 subcodes_nocombi = list( - subcodes_nocombi - [subcodes_nocombi == 0].index) + subcodes_nocombi[subcodes_nocombi == 0].index) # intersect non-combinable subcodes with less strict subcodes subcodes_deactivation = np.sort( list(set(codes_less_strict).intersection(subcodes_nocombi))) for nocombi_code in subcodes_deactivation: + # check where the less strict NPI is mentioned, only + # considering rows where the stricter NPI is mentioned. 
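+                    # Hypothetical illustration: if the stricter subcode is
+                    # active on days [3, 4, 5] and nocombi_code is mentioned
+                    # on days [4, 6], the lookup below only yields day 4;
+                    # day 6 lies outside subcode_active and is kept.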
days_deact = np.where( df_merged.loc[subcode_active, nocombi_code] > 0)[0] if len(days_deact) > 0: @@ -1132,6 +1143,9 @@ def get_npi_data(fine_resolution=2, print('\t' + str(nocombi_code) + ' due to ' + str(subcode) + ' on ' + str(len(days_deact)) + ' days.') print('\n') + # take subcode_active rows as days_deact is + # numbering inside subcode_active rows only, + # not numbering on the whole df_merge data frame df_merged.loc[subcode_active, nocombi_code] = 0 df_count_deactivation[maincode][1].loc[idx_strictness, nocombi_code] += len(days_deact) From ab5e1530e9dd4bd8bb7d36bc1b8097ac0311ce6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Fri, 1 Sep 2023 17:54:25 +0200 Subject: [PATCH 095/104] renaming --- .../memilio/epidata/getNPIData.py | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 5502ee7fac..2cea852542 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -1083,18 +1083,18 @@ def get_npi_data(fine_resolution=2, = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) # merge incidence dependent NPIs to have only one column for each subcode - df_merged = df_local_new.iloc[:, :2].copy() + df_local_new_merged = df_local_new.iloc[:, :2].copy() for subcode in all_subcodes: # extract columns which have the subcode as part of the column # name and sum over all these subcodes - df_merged[subcode] = df_local_new.filter( + df_local_new_merged[subcode] = df_local_new.filter( regex=subcode).sum(axis=1) # strictness deactivation is done with this merged dataframe df_incid_depend = pd.concat( - [df_incid_depend, copy.deepcopy(df_merged)]) + [df_incid_depend, copy.deepcopy(df_local_new_merged)]) - if df_merged.iloc[:,2:].max().max() > 1: + if df_local_new_merged.iloc[:,2:].max().max() > 1: raise gd.DataError('Error in merging...') ## Remove conflicting NPIs according to strictness index of Corona- @@ -1115,7 +1115,7 @@ def get_npi_data(fine_resolution=2, subcode = subcodes[idx_strictness] # get indices of days where subcode is active - subcode_active = np.where(df_merged.loc[:, subcode] > 0)[0] + subcode_active = np.where(df_local_new_merged.loc[:, subcode] > 0)[0] if len(subcode_active) > 0: # get indices of less strict NPIs @@ -1136,7 +1136,7 @@ def get_npi_data(fine_resolution=2, # check where the less strict NPI is mentioned, only # considering rows where the stricter NPI is mentioned. 
days_deact = np.where( - df_merged.loc[subcode_active, nocombi_code] > 0)[0] + df_local_new_merged.loc[subcode_active, nocombi_code] > 0)[0] if len(days_deact) > 0: print('Deactivating for ' + 'County ' + str(countyID)) @@ -1145,26 +1145,27 @@ def get_npi_data(fine_resolution=2, print('\n') # take subcode_active rows as days_deact is # numbering inside subcode_active rows only, - # not numbering on the whole df_merge data frame - df_merged.loc[subcode_active, nocombi_code] = 0 + # not numbering on the whole df_local_new_merged + # data frame + df_local_new_merged.loc[subcode_active, nocombi_code] = 0 df_count_deactivation[maincode][1].loc[idx_strictness, nocombi_code] += len(days_deact) # count joint codes from after strictness based deactivation - df_count_active = count_codes(df_merged, df_count_active, countyID) + df_count_active = count_code_multiplicities_merged(df_local_new_merged, df_count_active, countyID) # count joint codes from after incidence based activation - df_count_incid_depend = count_codes( + df_count_incid_depend = count_code_multiplicities_merged( df_incid_depend, df_count_incid_depend, countyID) # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: - df_local_new = df_merged.copy() + df_local_new = df_local_new_merged.copy() else: # multiply subcode columns with incidence dependent subcode columns in df_local_new for subcode in all_subcodes: for incidcode in ['', '_1', '_2', '_3', '_4', '_5']: - df_local_new[subcode+incidcode] *= df_merged[subcode] + df_local_new[subcode+incidcode] *= df_local_new_merged[subcode] counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1254,18 +1255,18 @@ def get_npi_data(fine_resolution=2, return df_npis -def count_code_multiplicities_init(df_npis_old, df_count, counties_considered): +def count_code_multiplicities_init(df_npis_input, df_count, counties_considered): """! Count for all pairs of NPI codes how many times they were mentioned at the same day in the initial data frame. - @param[in] df_npis_old Initial data frame read from Corona Datenplattform. + @param[in] df_npis_input Initial data frame read from Corona Datenplattform. @param[in,out] df_count Dictionnary of main NPI codes with empty interaction matrix (to be filled) for all codes under main code in df_count[maincode][1]. @param[in] counties_considered County IDs for which initial data frame is considered. """ for county in counties_considered: - df_local = df_npis_old[df_npis_old[dd.EngEng['idCounty']] == county] + df_local = df_npis_input[df_npis_input[dd.EngEng['idCounty']] == county] # get column where dates start npi_start_col = np.where( df_local.columns.str.startswith('d2') == True)[0][0] @@ -1304,8 +1305,16 @@ def count_code_multiplicities_init(df_npis_old, df_count, counties_considered): return df_count -def count_codes(df_old, df_count, county): - df_local = df_old[df_old[dd.EngEng['idCounty']] == county] +def count_code_multiplicities_merged(df_npis_input, df_count, county): + """! Count for all pairs of NPI codes how many times they were + mentioned or active at the same day in a transformed data frame. + + @param[in] df_npis_input Initial data frame read from Corona Datenplattform. + @param[in,out] df_count Dictionnary of main NPI codes with empty interaction + matrix (to be filled) for all codes under main code in df_count[maincode][1]. + @param[in] County CountyID with which input data frame df_count is changed. 
+ """ + df_local = df_npis_input[df_npis_input[dd.EngEng['idCounty']] == county] code_dict = {} for maincode in df_count.keys(): for column in df_count[maincode][1].columns: From 40a90598d82197a763ec3a8403358e97ec008b54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Fri, 1 Sep 2023 19:06:16 +0200 Subject: [PATCH 096/104] count code functions reduced --- .../memilio/epidata/getNPIData.py | 78 ++++++++----------- 1 file changed, 32 insertions(+), 46 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 2cea852542..46696419bb 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -890,7 +890,7 @@ def get_npi_data(fine_resolution=2, df_count_joint_codes = copy.deepcopy(df_npis_combinations) for maincode in df_count_joint_codes.keys(): df_count_joint_codes[maincode][1] *= 0 - df_counted_joint_codes = count_code_multiplicities_init(df_npis_old, df_count_joint_codes, + df_counted_joint_codes = count_code_multiplicities(df_npis_old, df_count_joint_codes, counties_considered=counties_considered) save_interaction_matrix(df_counted_joint_codes, 'joint_codes', directory) plot_interaction_matrix('joint_codes', directory) @@ -1152,11 +1152,11 @@ def get_npi_data(fine_resolution=2, nocombi_code] += len(days_deact) # count joint codes from after strictness based deactivation - df_count_active = count_code_multiplicities_merged(df_local_new_merged, df_count_active, countyID) + df_count_active = count_code_multiplicities(df_local_new_merged, df_count_active, [countyID], False) # count joint codes from after incidence based activation - df_count_incid_depend = count_code_multiplicities_merged( - df_incid_depend, df_count_incid_depend, countyID) + df_count_incid_depend = count_code_multiplicities( + df_incid_depend, df_count_incid_depend, [countyID], False) # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: @@ -1255,7 +1255,7 @@ def get_npi_data(fine_resolution=2, return df_npis -def count_code_multiplicities_init(df_npis_input, df_count, counties_considered): +def count_code_multiplicities(df_npis_input, df_count, counties_considered, initial_data_frame=True): """! Count for all pairs of NPI codes how many times they were mentioned at the same day in the initial data frame. @@ -1264,6 +1264,8 @@ def count_code_multiplicities_init(df_npis_input, df_count, counties_considered) matrix (to be filled) for all codes under main code in df_count[maincode][1]. @param[in] counties_considered County IDs for which initial data frame is considered. + @param[in] initial_data_frame Defines where initial data frame structure is + input. 
""" for county in counties_considered: df_local = df_npis_input[df_npis_input[dd.EngEng['idCounty']] == county] @@ -1278,17 +1280,30 @@ def count_code_multiplicities_init(df_npis_input, df_count, counties_considered) code_list = df_count[maincode][1].columns # iterate over code/row indices 0 to n for code_idx in range(len(code_list)): - # get dates where NPI is mentioned as existing in potential intervention set - npi_rows = df_local.NPI_code.str.contains(code_list[code_idx]) - npi_dates_in_df = np.where( - df_local[npi_rows].iloc[:, npi_start_col:].max() > 0)[0] - # store non-transforemed dates in code_dict - code_dates[code_list[code_idx]] = df_local.iloc[:, - npi_start_col + npi_dates_in_df].columns - - # count number of multiply mentionned NPIs with different incidence thresholds for the same day - df_count[maincode][1].iloc[code_idx, code_idx] += df_local[npi_rows].iloc[:, - npi_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df) + + # initial data frame (df_npis_old) and reworked new data frames + # are transposed (NPIs and dates in rows and columns switched) + if initial_data_frame: + # get dates where NPI is mentioned as existing in potential intervention set + npi_rows = df_local.NPI_code.str.contains(code_list[code_idx]) + npi_dates_in_df = np.where( + df_local[npi_rows].iloc[:, npi_start_col:].max() > 0)[0] + # store non-transformed dates in code_dict + code_dates[code_list[code_idx]] = df_local.iloc[:, + npi_start_col + npi_dates_in_df].columns + # count number of multiply mentionned NPIs with different incidence thresholds for the same day + df_count[maincode][1].iloc[code_idx, code_idx] += df_local[npi_rows].iloc[:, + npi_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df) + else: + # get dates where NPI is mentioned as existing in potential intervention set + npi_cols = df_local.columns.str.contains( + code_list[code_idx]) + npi_dates_in_df = np.where( + df_local.loc[:, npi_cols].max(axis=1) > 0)[0] + # store transformed dates in code_dict + code_dates[code_list[code_idx] + ] = df_local.iloc[npi_dates_in_df, 0].to_list() + # offdiagonal entries (as before, use that code_dates has been filled for all diagonal entries, i.e., all codes) for maincode in df_count.keys(): @@ -1303,36 +1318,7 @@ def count_code_multiplicities_init(df_npis_input, df_count, counties_considered) code_dates[code_list[code_idx]]).intersection(set(code_dates[code_list[code_idx_other]]))) return df_count - - -def count_code_multiplicities_merged(df_npis_input, df_count, county): - """! Count for all pairs of NPI codes how many times they were - mentioned or active at the same day in a transformed data frame. - - @param[in] df_npis_input Initial data frame read from Corona Datenplattform. - @param[in,out] df_count Dictionnary of main NPI codes with empty interaction - matrix (to be filled) for all codes under main code in df_count[maincode][1]. - @param[in] County CountyID with which input data frame df_count is changed. - """ - df_local = df_npis_input[df_npis_input[dd.EngEng['idCounty']] == county] - code_dict = {} - for maincode in df_count.keys(): - for column in df_count[maincode][1].columns: - code_dict[column] = df_local.iloc[np.where( - df_local.loc[:, df_local.columns.str.contains(column)].max(axis=1) > 0)[0], 0].to_list() - - # iterate over code/column indices 0 to code_idx-1 (not filling diagonal) - # Note that the upper diagonal part of the matrix does not - # need to be considered as matrix is symmetric. 
- for maincode in df_count.keys(): - column_list = df_count[maincode][1].columns - for column in range(len(column_list)): - for column_other in range(column): - df_count[maincode][1].iloc[column, column_other] += len(set( - code_dict[column_list[column]]).intersection(set(code_dict[column_list[column_other]]))) - - return df_count - + def save_interaction_matrix(df_interactions, filename, directory): """! Saves interaction matrices for all subcodes in provided main codes. From 45a7ff76da9c366a68db0251ba1b30b9630d8841 Mon Sep 17 00:00:00 2001 From: annawendler Date: Mon, 4 Sep 2023 10:53:03 +0200 Subject: [PATCH 097/104] consistency for copies --- .../memilio/epidata/getNPIData.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 46696419bb..f0d98d35a9 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -723,8 +723,8 @@ def get_npi_data(fine_resolution=2, key: val for key, val in df_npis_combinations[code][0].items() if key in npis['NPI_code'].values} # remove columns of combinations - df_npis_combinations[code][1] = df_npis_combinations[code][1].loc[local_codes_used_rows, - local_codes_used_cols].reset_index(drop=True).copy() + df_npis_combinations[code][1] = copy.deepcopy(df_npis_combinations[code][1].loc[local_codes_used_rows, + local_codes_used_cols].reset_index(drop=True)) # prepare grouping of NPIs to reduce product space of # NPI x active_from_inc (with values "incidence does not matter", and @@ -1083,7 +1083,7 @@ def get_npi_data(fine_resolution=2, = df_local_new.iloc[:, npis_idx_start + np.array(npi_indices)].mul(int_active, axis=0) # merge incidence dependent NPIs to have only one column for each subcode - df_local_new_merged = df_local_new.iloc[:, :2].copy() + df_local_new_merged = copy.deepcopy(df_local_new.iloc[:, :2]) for subcode in all_subcodes: # extract columns which have the subcode as part of the column # name and sum over all these subcodes @@ -1160,7 +1160,7 @@ def get_npi_data(fine_resolution=2, # for fine resolution = 1 only consider merged dataframe if fine_resolution == 1: - df_local_new = df_local_new_merged.copy() + df_local_new = copy.deepcopy(df_local_new_merged) else: # multiply subcode columns with incidence dependent subcode columns in df_local_new for subcode in all_subcodes: @@ -1173,10 +1173,9 @@ def get_npi_data(fine_resolution=2, start_time = time.perf_counter() - df_npis = pd.concat( - [df_npis.copy(), - df_local_new.copy()], - ignore_index=True) + df_npis = copy.deepcopy(pd.concat( + [df_npis, df_local_new], + ignore_index=True)) counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1458,7 +1457,7 @@ def plot_activation(df_npis_old, df_final, directory, maincode, subcodes): plt.xticks(tick_range) plt.plot(mentions.index, mentions, c='firebrick', label=name + ' mentioned') - implementations = mentions[54:].copy() + implementations = copy.deepcopy(mentions[54:]) implementations *= 0 for county in counties: implementations += df_final[df_final.ID_County == From db2fb9c10f92f74dee2b9e5eedf361748d056c14 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 4 Sep 2023 13:10:14 +0200 Subject: [PATCH 098/104] sanity checks and exception handling --- .../memilio/epidata/getNPIData.py | 71 ++++++++++--------- .../epidata_test/test_epidata_getNPIData.py | 8 +-- 2 files changed, 41 insertions(+), 38 deletions(-) diff --git 
a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 46696419bb..f3a790a474 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -434,6 +434,35 @@ def drop_codes_and_categories( return codes_dropped, npi_codes_prior, df_npis_old +def npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre): + # Check if all counties are in df_npis_old + if not np.array_equal(df_npis_old.ID_County.unique().astype(int), np.array(geoger.get_county_ids(merge_eisenach=False)).astype(int)): + raise gd.DataError('Not all counties found in DataFrame.') + # Check if all NPIs are in df_npis_old + if len(df_npis_old.NPI_code.unique()) != 1152: + raise gd.DataError('Missing NPIs in DataFrame.') + # check columns of df_npis_old (6 columns with info, 883 dates until 2022/07/31) + if len(df_npis_old.columns) != 889: + raise gd.DataError('Unexpected length of DataFrame.') + # check if Variablenname, Variable and Beschreibung are in df_npis_desc columns + if not ('Variablenname' in df_npis_desc.columns): + raise gd.DataError('Column Variablenname not found.') + if not ('Variable' in df_npis_desc.columns): + raise gd.DataError('Column Variable not found.') + if not ('Beschreibung' in df_npis_desc.columns): + raise gd.DataError('Column Beschreibung not found.') + # df_npis_desc should have 1124 rows + if len(df_npis_desc)!=1224: + raise gd.DataError('Unexpected length of description DataFrame.') + # df_npis_combinations_pre should habe 204 rows (1224/6) + if len(df_npis_combinations_pre)!=204: + raise gd.DataError('Unexpected length of combination DataFrame.') + # combination part should have values NaN and x + for column in df_npis_combinations_pre.columns[5:]: + if (len(df_npis_combinations_pre[column].unique()) != 2) | ('x' not in df_npis_combinations_pre[column].unique()): + raise gd.DataError('Unexpected values in combination matrix.') + + def get_npi_data(fine_resolution=2, file_format=dd.defaultDict['file_format'], @@ -514,6 +543,8 @@ def get_npi_data(fine_resolution=2, print('Download completed.') + npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre) + # Compute column index of NPI start (columns with NPIs start with days # which are provided in format dYYYYMMDD). npi_start_col = np.where( @@ -654,8 +685,8 @@ def get_npi_data(fine_resolution=2, if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out): print('Error in combination matrix.') del df_in_valid - except: - pass + except FileNotFoundError: + print('No verification matrix found. 
Continuing without verifying combination matrix.') if write_file: df_out.to_excel( @@ -849,7 +880,7 @@ def get_npi_data(fine_resolution=2, try: df_population = pd.read_json( directory + "county_current_population.json") - except: + except FileNotFoundError: df_population = gpd.get_population_data() min_date.append( df_infec_rki[dd.EngEng['date']].min().to_pydatetime()) @@ -1054,7 +1085,7 @@ def get_npi_data(fine_resolution=2, df_local_new[dd.EngEng['date']] <= end_date_new), :].reset_index() try: df_local_new = df_local_new.drop(columns='index') - except: + except TypeError: pass # get index of first NPI column in local data frame npis_idx_start = list( @@ -1217,11 +1248,11 @@ def get_npi_data(fine_resolution=2, df_npis.reset_index(inplace=True) try: df_npis = df_npis.drop(columns='index') - except: + except TypeError: pass try: df_npis = df_npis.drop(columns='level_0') - except: + except TypeError: pass #### start validation #### @@ -1356,7 +1387,7 @@ def plot_interaction_matrix(filename, directory): try: codelist = pd.ExcelFile(os.path.join( directory, filename + '.xlsx'), engine='openpyxl').sheet_names - except: + except FileNotFoundError: raise FileNotFoundError('File ' + filename + ' not found.') # invert color map elements for tab20c such that subcolors are shown @@ -1445,32 +1476,6 @@ def plot_interaction_matrix(filename, directory): plt.close() -def plot_activation(df_npis_old, df_final, directory, maincode, subcodes): - counties = df_final.ID_County.unique() - for scode in subcodes: - name = maincode+scode - mentions = df_npis_old[df_npis_old.NPI_code == - name].iloc[:, 6:].sum(axis=0) - for inc_idx in ['_1', '_2', '_3', '_4', '_5']: - mentions += df_npis_old[df_npis_old.NPI_code == - name+inc_idx].iloc[:, 6:].sum(axis=0) - tick_range = (np.arange(int(len(mentions) / 100) + 1) * 100) - plt.xticks(tick_range) - plt.plot(mentions.index, mentions, c='firebrick', - label=name + ' mentioned') - implementations = mentions[54:].copy() - implementations *= 0 - for county in counties: - implementations += df_final[df_final.ID_County == - county].filter(regex=name).sum(axis=1).values - plt.plot(implementations.index, implementations, - c='g', label=name + ' active') - plt.grid(True) - plt.legend(loc='best') - plt.tight_layout - plt.savefig(directory+'npi_plots/'+name) - plt.clf() - def main(): """! Main program entry.""" diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index 38355d9c53..c1e047279f 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -282,9 +282,8 @@ def test_drop_codes_and_categories(self): return_value=[[], df_npis_desc['Variablenname'], df_npis_old_renamed]) - @patch('memilio.epidata.getNPIData.plot_counter') - @patch('memilio.epidata.getNPIData.plot_multiple_prescriptions') - def test_get_npi_data(self, mock_plot_mult, mock_plot, mock_codes, mock_read, mock_data): + @patch('memilio.epidata.getNPIData.plot_interaction_matrix') + def test_get_npi_data(self, mock_plot, mock_codes, mock_read, mock_data): # print 'Additional errors in consistent naming' is expected. # print 'WARNING: DataFrame starts with reported cases > 0 for more than 5 percent...' is expected. 
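         # A minimal sketch of the kind of consistency check npi_sanity_check
         # performs (hypothetical frame df; the real check compares the full
         # county list):
         #   assert np.array_equal(df.ID_County.unique().astype(int),
         #       np.array(geoger.get_county_ids(merge_eisenach=False)).astype(int))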
npis_test = gnd.get_npi_data( @@ -337,8 +336,7 @@ def test_get_npi_data(self, mock_plot_mult, mock_plot, mock_codes, mock_read, mo npis_test.M1_1_1.to_list(), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) - self.assertEqual(mock_plot.call_count, 4) - self.assertEqual(mock_plot_mult.call_count, 1) + self.assertEqual(mock_plot.call_count, 5) if __name__ == '__main__': From ae4618e6311f0ea596f013e6073a499cf77a9f74 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 4 Sep 2023 13:44:46 +0200 Subject: [PATCH 099/104] fix test --- .../memilio/epidata/getNPIData.py | 25 +++++++++---------- .../epidata_test/test_epidata_getNPIData.py | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index 3adc0fdbc4..c7761cd0ac 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -257,6 +257,8 @@ def read_files(directory, fine_resolution): print('File not found.') raise FileNotFoundError + npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre) + return df_npis_old, df_npis_desc, df_npis_combinations_pre @@ -543,8 +545,6 @@ def get_npi_data(fine_resolution=2, print('Download completed.') - npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre) - # Compute column index of NPI start (columns with NPIs start with days # which are provided in format dYYYYMMDD). npi_start_col = np.where( @@ -676,7 +676,8 @@ def get_npi_data(fine_resolution=2, [desc for desc in npi_codes_prior_desc [npi_codes_prior.isin(codes_local)].values]) - try: + # validate if combinations cleanout file exists, else write this file + if write_file == False: # store verified output df_in_valid = pd.read_excel( os.path.join( @@ -685,10 +686,7 @@ def get_npi_data(fine_resolution=2, if not df_in_valid.drop(columns='Unnamed: 0').equals(df_out): print('Error in combination matrix.') del df_in_valid - except FileNotFoundError: - print('No verification matrix found. 
Continuing without verifying combination matrix.') - - if write_file: + else: df_out.to_excel( writer, sheet_name=npi_groups_combinations_unique[i]) del df_out @@ -1085,7 +1083,7 @@ def get_npi_data(fine_resolution=2, df_local_new[dd.EngEng['date']] <= end_date_new), :].reset_index() try: df_local_new = df_local_new.drop(columns='index') - except TypeError: + except KeyError: pass # get index of first NPI column in local data frame npis_idx_start = list( @@ -1247,11 +1245,11 @@ def get_npi_data(fine_resolution=2, df_npis.reset_index(inplace=True) try: df_npis = df_npis.drop(columns='index') - except TypeError: + except KeyError: pass try: df_npis = df_npis.drop(columns='level_0') - except TypeError: + except KeyError: pass #### start validation #### @@ -1299,9 +1297,10 @@ def count_code_multiplicities(df_npis_input, df_count, counties_considered, init """ for county in counties_considered: df_local = df_npis_input[df_npis_input[dd.EngEng['idCounty']] == county] - # get column where dates start - npi_start_col = np.where( - df_local.columns.str.startswith('d2') == True)[0][0] + if initial_data_frame: + # get column where dates start + npi_start_col = np.where( + df_local.columns.str.startswith('d2') == True)[0][0] # prepare dictionnary for dates when code was mentioned code_dates = {} # run through all maincodes (i.e., first 3-4 characters like M01a or M11) diff --git a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py index c1e047279f..828ea08569 100644 --- a/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata_test/test_epidata_getNPIData.py @@ -336,7 +336,7 @@ def test_get_npi_data(self, mock_plot, mock_codes, mock_read, mock_data): npis_test.M1_1_1.to_list(), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) - self.assertEqual(mock_plot.call_count, 5) + self.assertEqual(mock_plot.call_count, 4) if __name__ == '__main__': From 1f770d95207476ffb38e53abc3fced07879cd45f Mon Sep 17 00:00:00 2001 From: annawendler Date: Mon, 4 Sep 2023 14:09:14 +0200 Subject: [PATCH 100/104] adjust sanity check, remove one deepcopy --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index c7761cd0ac..ee4f134d53 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -461,7 +461,7 @@ def npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre): raise gd.DataError('Unexpected length of combination DataFrame.') # combination part should have values NaN and x for column in df_npis_combinations_pre.columns[5:]: - if (len(df_npis_combinations_pre[column].unique()) != 2) | ('x' not in df_npis_combinations_pre[column].unique()): + if not np.array_equal(np.sort(df_npis_combinations_pre[column].unique().astype(str)), np.array([np.nan, 'x']): raise gd.DataError('Unexpected values in combination matrix.') @@ -752,8 +752,8 @@ def get_npi_data(fine_resolution=2, key: val for key, val in df_npis_combinations[code][0].items() if key in npis['NPI_code'].values} # remove columns of combinations - df_npis_combinations[code][1] = copy.deepcopy(df_npis_combinations[code][1].loc[local_codes_used_rows, - local_codes_used_cols].reset_index(drop=True)) + df_npis_combinations[code][1] = 
df_npis_combinations[code][1].loc[local_codes_used_rows, + local_codes_used_cols].reset_index(drop=True) # prepare grouping of NPIs to reduce product space of # NPI x active_from_inc (with values "incidence does not matter", and From 4adad525c78d04db5b7f1b87c8b6a2a3fe36204a Mon Sep 17 00:00:00 2001 From: annawendler Date: Mon, 4 Sep 2023 15:32:36 +0200 Subject: [PATCH 101/104] fix bracket --- pycode/memilio-epidata/memilio/epidata/getNPIData.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py index ee4f134d53..2157c75ffe 100644 --- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py @@ -461,7 +461,7 @@ def npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre): raise gd.DataError('Unexpected length of combination DataFrame.') # combination part should have values NaN and x for column in df_npis_combinations_pre.columns[5:]: - if not np.array_equal(np.sort(df_npis_combinations_pre[column].unique().astype(str)), np.array([np.nan, 'x']): + if not np.array_equal(np.sort(df_npis_combinations_pre[column].unique().astype(str)), np.array([np.nan, 'x'])): raise gd.DataError('Unexpected values in combination matrix.') From 121e65d7c038e00ba615a6cd290f63b8d28debf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 4 Sep 2023 22:53:45 +0200 Subject: [PATCH 102/104] start date default --- pycode/memilio-epidata/memilio/epidata/defaultDict.py | 2 +- pycode/memilio-epidata/memilio/epidata/getDIVIData.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/defaultDict.py b/pycode/memilio-epidata/memilio/epidata/defaultDict.py index e20e6289c5..200830c472 100644 --- a/pycode/memilio-epidata/memilio/epidata/defaultDict.py +++ b/pycode/memilio-epidata/memilio/epidata/defaultDict.py @@ -43,7 +43,7 @@ 'make_plot': False, 'out_folder': default_file_path, 'update_data': False, - 'start_date': date(2020, 4, 24), + 'start_date': date(2020, 1, 1), 'end_date': date.today(), 'split_berlin': False, 'impute_dates': False, diff --git a/pycode/memilio-epidata/memilio/epidata/getDIVIData.py b/pycode/memilio-epidata/memilio/epidata/getDIVIData.py index c6d203c424..f8be9af6ce 100644 --- a/pycode/memilio-epidata/memilio/epidata/getDIVIData.py +++ b/pycode/memilio-epidata/memilio/epidata/getDIVIData.py @@ -50,7 +50,7 @@ def get_divi_data(read_data=dd.defaultDict['read_data'], file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], no_raw=dd.defaultDict['no_raw'], - start_date=dd.defaultDict['start_date'], + start_date=date(2020, 4, 24), end_date=dd.defaultDict['end_date'], impute_dates=dd.defaultDict['impute_dates'], moving_average=dd.defaultDict['moving_average'], From efdb30d39f352a9717663715e35ebbd45df39b1e Mon Sep 17 00:00:00 2001 From: annawendler Date: Tue, 5 Sep 2023 09:51:36 +0200 Subject: [PATCH 103/104] change start_comb_matrix, fix npi_sanity_check, remove compareNPIData --- .../memilio/epidata/compareNPIData.py | 98 ------------------- .../memilio/epidata/getNPIData.py | 9 +- 2 files changed, 4 insertions(+), 103 deletions(-) delete mode 100644 pycode/memilio-epidata/memilio/epidata/compareNPIData.py diff --git a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py b/pycode/memilio-epidata/memilio/epidata/compareNPIData.py deleted file mode 100644 index 5d60b786aa..0000000000 --- 
a/pycode/memilio-epidata/memilio/epidata/compareNPIData.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import csv -import pandas as pd -import numpy as np - -from memilio.epidata import getDataIntoPandasDataFrame as gd -from memilio.epidata import defaultDict as dd - -directory = '/home/wend_aa/memilio/data/pydata/Germany' - -############################################################################################################# -# read old data for subcategories - -df_npis_old_data = pd.read_csv( - os.path.join(directory, 'kr_massnahmen_unterkategorien.csv'), - sep=',') # , nrows=numberofcities*1248 - -df_npis_old_data.rename(dd.GerEng, axis=1, inplace=True) - -############################################################################################################# -# read new data for subcategories - -codelist = ['m01a', 'm01b', 'm02a', 'm02b', 'm03', 'm04', 'm05', 'm06', 'm07', 'm08', 'm09', - 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm16', 'm17', 'm18', 'm19', 'm20', 'm21'] -counter_codes = 0 -for code in codelist: - print(code) - df_npis_per_code = pd.read_csv( - os.path.join(directory, - f'kr_massn_unterkat_{code}.csv'), - sep=',') - - # set some parameters for dataframe - if counter_codes == 0: - counties = np.sort(df_npis_per_code.ags5.unique()) - num_counties = len(df_npis_per_code.ags5.unique()) - - # extract dates from data - dates = df_npis_per_code.iloc[:int( - df_npis_per_code.shape[0]/num_counties), 5] - # rename dates so that they match dates from other npi dataframe - dates_new = ['d' + date.replace('-', '') for date in dates] - - df_local = [pd.DataFrame() for i in range(num_counties)] - - # set df for all counties - for i in range(0, num_counties): - print(i) - if counter_codes == 0: - df_local[i] = pd.DataFrame(columns=list( - df_npis_per_code.columns[0:5]) + ['code'] + dates_new) - - dummy_to_append = pd.DataFrame(columns=[ - 'code'] + dates_new, data=df_npis_per_code[df_npis_per_code.ags5 == counties[i]].iloc[:, 6:].T.reset_index().values.copy()) - - df_local[i] = pd.concat([df_local[i], dummy_to_append]) - - if df_npis_per_code.iloc[i*len(dates):(i+1)*len(dates), 3].nunique() > 1: - raise gd.DataError('Dates are not sorted as expected.') - - # Set first five columns so that they match old format of data frame (from kr_massnahmen_unterkategorien.csv) - if counter_codes == len(codelist)-1: - df_local[i].iloc[:, 0:5] = df_npis_per_code.iloc[i * - len(dates), 0:5].values - - counter_codes += 1 - -df_npis_new_data = pd.concat([df_local[i] for i in range(num_counties)]) -df_npis_new_data.rename(dd.GerEng, axis=1, inplace=True) -df_npis_new_data['NPI_code'] = df_npis_new_data['NPI_code'].str.replace( - 'code_m', 'M') - - -############################################################################################################# -# compare dataframes - -# check if all rows for code M22, M23 and M24 in df_npis_old_data are empty -codesnotused = ((df_npis_old_data[df_npis_old_data["NPI_code"].str.contains( - "M22|M23|M24")].iloc[:, 6:] == -99).all() == True).all() -if codesnotused == True: - print("Codes M22, M23 and M24 are not used in old data (as expected).") -else: - print("Something wrong with data.") - -# remove rows for codes M22, M23 and M24 from df_npis_old_data -df_npis_old_data = df_npis_old_data[~df_npis_old_data["NPI_code"].str.contains( - "M22|M23|M24")].copy() - -# check how many days are covered in each dataframe and adjust accordingly so that both dataframes have same size -# we already know that df_npis_new_data has more columns than 
df_npis_old_data
-df_npis_new_data = df_npis_new_data.iloc[:, :len(df_npis_old_data.columns)]
-
-# assert if frames are equal (except index and column '_id')
-
-if (pd.testing.assert_frame_equal(df_npis_old_data.iloc[:, 1:].reset_index(drop=True), df_npis_new_data.iloc[:, 1:].reset_index(drop=True), check_dtype=False) == None):
-    print('Data frames are equal.')
-else:
-    print('Data frames are not equal.')
diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index 2157c75ffe..8d65bab09f 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -453,7 +453,7 @@ def npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre):
         raise gd.DataError('Column Variable not found.')
     if not ('Beschreibung' in df_npis_desc.columns):
         raise gd.DataError('Column Beschreibung not found.')
-    # df_npis_desc should have 1124 rows
+    # df_npis_desc should have 1224 rows
     if len(df_npis_desc)!=1224:
         raise gd.DataError('Unexpected length of description DataFrame.')
     # df_npis_combinations_pre should have 204 rows (1224/6)
@@ -461,7 +461,7 @@ def npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre):
         raise gd.DataError('Unexpected length of combination DataFrame.')
     # combination part should have values NaN and x
     for column in df_npis_combinations_pre.columns[5:]:
-        if not np.array_equal(np.sort(df_npis_combinations_pre[column].unique().astype(str)), np.array([np.nan, 'x'])):
+        if not set(df_npis_combinations_pre[column].unique().astype(str)).issubset(set(['nan','x'])):
             raise gd.DataError('Unexpected values in combination matrix.')
@@ -637,9 +637,8 @@ def get_npi_data(fine_resolution=2,

     # run through all groups and set possible combinations according to
     # read combination matrix
-    # find begin of combination matrix
-    # there may be multiple columns named '0', so find first '1' column
-    start_comb_matrix = list(df_npis_combinations_pre.columns).index(1)-1
+    # find begin of combination matrix (find first '0' column)
+    start_comb_matrix = list(df_npis_combinations_pre.columns).index(0)
     for i in range(len(npi_groups_idx)):
         codes_local = df_npis_combinations_pre.loc[npi_groups_idx[i],
                                                    'Variablenname'].values

From 112f7b1bb58c27080540a3191f8c203364c66f9a Mon Sep 17 00:00:00 2001
From: patricklnz
Date: Tue, 5 Sep 2023 10:41:31 +0200
Subject: [PATCH 104/104] pre commit

---
 .../memilio/epidata/getNPIData.py             | 87 ++++++++++---------
 1 file changed, 48 insertions(+), 39 deletions(-)

diff --git a/pycode/memilio-epidata/memilio/epidata/getNPIData.py b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
index c7761cd0ac..1b59c8dfc4 100644
--- a/pycode/memilio-epidata/memilio/epidata/getNPIData.py
+++ b/pycode/memilio-epidata/memilio/epidata/getNPIData.py
@@ -283,7 +283,7 @@ def activate_npis_based_on_incidence(
     the incidence-dependent NPI is activated or lifted two days after
     the threshold is/is not anymore exceeded, additionally considering the
     number of consecutive days to implement or lift (see second paragraph
     above).
-    
+
     Please see the examples for a better understanding.
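[Editor's note: a minimal sketch of the activation/lifting rule described
above, for illustration only. The function name, the parameter names
(activation_days, lifting_days, delay) and the tie-breaking between
activation and lifting are assumptions of this sketch, not the actual
signature or rule of activate_npis_based_on_incidence(); the worked
examples below are authoritative.

    import numpy as np

    def activate_npis_sketch(incid, threshold, activation_days=1,
                             lifting_days=1, delay=2):
        # one 0/1 entry per day; 1 where the incidence-dependent NPI is active
        exceeds = np.asarray(incid) > threshold
        active = np.zeros(len(exceeds), dtype=int)
        for day in range(1, len(exceeds)):
            active[day] = active[day - 1]  # default: keep previous state
            obs = day - delay  # decisions act with a delay of 'delay' days
            if (obs + 1 >= activation_days
                    and exceeds[obs - activation_days + 1:obs + 1].all()):
                active[day] = 1  # exceeded on enough consecutive days
            elif (obs + 1 >= lifting_days
                    and not exceeds[obs - lifting_days + 1:obs + 1].any()):
                active[day] = 0  # undershot on enough consecutive days
        return active

With incid=[1, 4, 4, 1, 1, 1], threshold=3.5, activation_days=2 and
lifting_days=2 this returns [0, 0, 0, 0, 1, 1]: the threshold is exceeded
on two consecutive days (days 1 and 2, counting from 0), and the NPI
becomes active two days after that run is complete.]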
     Example 1 (Threshold=3.5):
@@ -436,6 +436,7 @@ def drop_codes_and_categories(
     return codes_dropped, npi_codes_prior, df_npis_old

+
 def npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre):
     # Check if all counties are in df_npis_old
     if not np.array_equal(df_npis_old.ID_County.unique().astype(int), np.array(geoger.get_county_ids(merge_eisenach=False)).astype(int)):
@@ -454,16 +455,15 @@ def npi_sanity_check(df_npis_old, df_npis_desc, df_npis_combinations_pre):
     if not ('Beschreibung' in df_npis_desc.columns):
         raise gd.DataError('Column Beschreibung not found.')
     # df_npis_desc should have 1124 rows
-    if len(df_npis_desc)!=1224:
+    if len(df_npis_desc) != 1224:
         raise gd.DataError('Unexpected length of description DataFrame.')
     # df_npis_combinations_pre should have 204 rows (1224/6)
-    if len(df_npis_combinations_pre)!=204:
+    if len(df_npis_combinations_pre) != 204:
         raise gd.DataError('Unexpected length of combination DataFrame.')
     # combination part should have values NaN and x
     for column in df_npis_combinations_pre.columns[5:]:
         if (len(df_npis_combinations_pre[column].unique()) != 2) | ('x' not in df_npis_combinations_pre[column].unique()):
             raise gd.DataError('Unexpected values in combination matrix.')
-

 def get_npi_data(fine_resolution=2,
@@ -545,7 +545,7 @@ def get_npi_data(fine_resolution=2,

     print('Download completed.')

-    # Compute column index of NPI start (columns with NPIs start with days 
+    # Compute column index of NPI start (columns with NPIs start with days
     # which are provided in format dYYYYMMDD).
     npi_start_col = np.where(
         df_npis_old.columns.str.contains('d2') == True)[0][0]
@@ -602,7 +602,8 @@ def get_npi_data(fine_resolution=2,
     # rename essential columns and throw away others
     columns_used = np.where(
         (df_npis_combinations_pre == 'x').any() == True)[0]
-    column_names = list(df_npis_combinations_pre.columns[[i for i in columns_used]])
+    column_names = list(
+        df_npis_combinations_pre.columns[[i for i in columns_used]])
     rename_columns = {column_names[i]: i for i in range(len(column_names))}
     df_npis_combinations_pre.rename(columns=rename_columns, inplace=True)
     df_npis_combinations_pre = df_npis_combinations_pre[[
@@ -753,7 +754,7 @@ def get_npi_data(fine_resolution=2,
             key: val for key, val in df_npis_combinations[code][0].items()
             if key in npis['NPI_code'].values}
         # remove columns of combinations
         df_npis_combinations[code][1] = copy.deepcopy(df_npis_combinations[code][1].loc[local_codes_used_rows,
-                                                                                        local_codes_used_cols].reset_index(drop=True))
+                                                      local_codes_used_cols].reset_index(drop=True))

     # prepare grouping of NPIs to reduce product space of
     # NPI x active_from_inc (with values "incidence does not matter", and
@@ -920,7 +921,7 @@ def get_npi_data(fine_resolution=2,
         for maincode in df_count_joint_codes.keys():
             df_count_joint_codes[maincode][1] *= 0
         df_counted_joint_codes = count_code_multiplicities(df_npis_old, df_count_joint_codes,
-            counties_considered=counties_considered)
+                                                           counties_considered=counties_considered)
         save_interaction_matrix(df_counted_joint_codes, 'joint_codes', directory)
         plot_interaction_matrix('joint_codes', directory)
@@ -953,7 +954,7 @@ def get_npi_data(fine_resolution=2,
         cid = 0
         countyidx += 1

-        ## compute incidence for given county and store in other data frame
+        # compute incidence for given county and store in other data frame
         if fine_resolution > 0:
             # compute incidence based on previous data frames
             df_infec_local = copy.deepcopy(
@@ -997,11 +998,12 @@ def get_npi_data(fine_resolution=2,

         # Consistency of incidence independent and dependent NPIs:
         # The same NPI should not be prescribed multiple times at 
the same day - # for different incidence-dependent thresholds or incidence-independently. + # for different incidence-dependent thresholds or incidence-independently. # In order to avoid contradictions, only retain the strictest mentioned # implementation. Incidence-independent is always stricter than any # incidence-dependent implementation. - print_details = True # define if details are printed (probably to be deactivated) + # define if details are printed (probably to be deactivated) + print_details = True for i in range(int(len(df_local_old)/inc_codes)): # check if access is correct @@ -1011,7 +1013,7 @@ def get_npi_data(fine_resolution=2, [inc_codes * i: inc_codes * (i + 1), npi_start_col - 1].to_list()]): raise gd.DataError('Wrong NPI rows aggregated.') - + sum_npi_inc = np.where( df_local_old.iloc[inc_codes*i:inc_codes*(i+1), npi_start_col:].sum() > 1) if (len(sum_npi_inc[0]) > 0) and print_details: @@ -1089,7 +1091,7 @@ def get_npi_data(fine_resolution=2, npis_idx_start = list( df_local_new.columns).index( npis[dd.EngEng['npiCode']][0]) - + # extract local incidence from local frame local_incid = copy.deepcopy(df_infec_local['Incidence']) @@ -1114,7 +1116,7 @@ def get_npi_data(fine_resolution=2, # merge incidence dependent NPIs to have only one column for each subcode df_local_new_merged = copy.deepcopy(df_local_new.iloc[:, :2]) for subcode in all_subcodes: - # extract columns which have the subcode as part of the column + # extract columns which have the subcode as part of the column # name and sum over all these subcodes df_local_new_merged[subcode] = df_local_new.filter( regex=subcode).sum(axis=1) @@ -1123,17 +1125,19 @@ def get_npi_data(fine_resolution=2, df_incid_depend = pd.concat( [df_incid_depend, copy.deepcopy(df_local_new_merged)]) - if df_local_new_merged.iloc[:,2:].max().max() > 1: + if df_local_new_merged.iloc[:, 2:].max().max() > 1: raise gd.DataError('Error in merging...') - ## Remove conflicting NPIs according to strictness index of Corona- - ## Datenplattform and exclusion criteria defined in df_npis_combinations + # Remove conflicting NPIs according to strictness index of Corona- + # Datenplattform and exclusion criteria defined in df_npis_combinations for maincode in df_npis_combinations.keys(): # get all subcodes subcodes = list(df_npis_combinations[maincode][0].keys()) - subcodes_strictness_values = list(df_npis_combinations[maincode][0].values()) + subcodes_strictness_values = list( + df_npis_combinations[maincode][0].values()) if len(subcodes) != len(subcodes_strictness_values): - raise gd.DataError('Subcode and strictness array inconsistent.') + raise gd.DataError( + 'Subcode and strictness array inconsistent.') # sort index reversely with the strictest (highest) index first idx_strictness_sorted_rev = np.argsort( subcodes_strictness_values)[::-1] @@ -1144,7 +1148,8 @@ def get_npi_data(fine_resolution=2, subcode = subcodes[idx_strictness] # get indices of days where subcode is active - subcode_active = np.where(df_local_new_merged.loc[:, subcode] > 0)[0] + subcode_active = np.where( + df_local_new_merged.loc[:, subcode] > 0)[0] if len(subcode_active) > 0: # get indices of less strict NPIs @@ -1162,7 +1167,7 @@ def get_npi_data(fine_resolution=2, list(set(codes_less_strict).intersection(subcodes_nocombi))) for nocombi_code in subcodes_deactivation: - # check where the less strict NPI is mentioned, only + # check where the less strict NPI is mentioned, only # considering rows where the stricter NPI is mentioned. 
days_deact = np.where( df_local_new_merged.loc[subcode_active, nocombi_code] > 0)[0] @@ -1173,15 +1178,17 @@ def get_npi_data(fine_resolution=2, str(subcode) + ' on ' + str(len(days_deact)) + ' days.') print('\n') # take subcode_active rows as days_deact is - # numbering inside subcode_active rows only, + # numbering inside subcode_active rows only, # not numbering on the whole df_local_new_merged # data frame - df_local_new_merged.loc[subcode_active, nocombi_code] = 0 + df_local_new_merged.loc[subcode_active, + nocombi_code] = 0 df_count_deactivation[maincode][1].loc[idx_strictness, nocombi_code] += len(days_deact) # count joint codes from after strictness based deactivation - df_count_active = count_code_multiplicities(df_local_new_merged, df_count_active, [countyID], False) + df_count_active = count_code_multiplicities( + df_local_new_merged, df_count_active, [countyID], False) # count joint codes from after incidence based activation df_count_incid_depend = count_code_multiplicities( @@ -1194,7 +1201,8 @@ def get_npi_data(fine_resolution=2, # multiply subcode columns with incidence dependent subcode columns in df_local_new for subcode in all_subcodes: for incidcode in ['', '_1', '_2', '_3', '_4', '_5']: - df_local_new[subcode+incidcode] *= df_local_new_merged[subcode] + df_local_new[subcode + + incidcode] *= df_local_new_merged[subcode] counters[cid] += time.perf_counter()-start_time cid += 1 @@ -1220,7 +1228,8 @@ def get_npi_data(fine_resolution=2, '. Estimated time remaining: ' + str(int(time_remain / 60)) + ' min.') - save_interaction_matrix(df_count_deactivation, 'count_deactivation', directory) + save_interaction_matrix(df_count_deactivation, + 'count_deactivation', directory) plot_interaction_matrix('count_deactivation', directory) if counter_cases_start >= len(counties_considered)*0.05: @@ -1231,7 +1240,8 @@ def get_npi_data(fine_resolution=2, 'Please consider a start date of some weeks ahead of the ' 'time window to be analyzed for NPI\'s effects.') - save_interaction_matrix(df_count_incid_depend, 'joint_codes_incid_depend', directory) + save_interaction_matrix(df_count_incid_depend, + 'joint_codes_incid_depend', directory) plot_interaction_matrix('joint_codes_incid_depend', directory) save_interaction_matrix(df_count_active, 'joint_codes_active', directory) @@ -1309,20 +1319,21 @@ def count_code_multiplicities(df_npis_input, df_count, counties_considered, init code_list = df_count[maincode][1].columns # iterate over code/row indices 0 to n for code_idx in range(len(code_list)): - + # initial data frame (df_npis_old) and reworked new data frames # are transposed (NPIs and dates in rows and columns switched) if initial_data_frame: # get dates where NPI is mentioned as existing in potential intervention set - npi_rows = df_local.NPI_code.str.contains(code_list[code_idx]) + npi_rows = df_local.NPI_code.str.contains( + code_list[code_idx]) npi_dates_in_df = np.where( df_local[npi_rows].iloc[:, npi_start_col:].max() > 0)[0] # store non-transformed dates in code_dict code_dates[code_list[code_idx]] = df_local.iloc[:, - npi_start_col + npi_dates_in_df].columns + npi_start_col + npi_dates_in_df].columns # count number of multiply mentionned NPIs with different incidence thresholds for the same day df_count[maincode][1].iloc[code_idx, code_idx] += df_local[npi_rows].iloc[:, - npi_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df) + npi_start_col + npi_dates_in_df].sum().sum() - len(npi_dates_in_df) else: # get dates where NPI is mentioned as existing in potential 
intervention set npi_cols = df_local.columns.str.contains( @@ -1333,7 +1344,6 @@ def count_code_multiplicities(df_npis_input, df_count, counties_considered, init code_dates[code_list[code_idx] ] = df_local.iloc[npi_dates_in_df, 0].to_list() - # offdiagonal entries (as before, use that code_dates has been filled for all diagonal entries, i.e., all codes) for maincode in df_count.keys(): code_list = df_count[maincode][1].columns @@ -1347,7 +1357,7 @@ def count_code_multiplicities(df_npis_input, df_count, counties_considered, init code_dates[code_list[code_idx]]).intersection(set(code_dates[code_list[code_idx_other]]))) return df_count - + def save_interaction_matrix(df_interactions, filename, directory): """! Saves interaction matrices for all subcodes in provided main codes. @@ -1377,12 +1387,12 @@ def plot_interaction_matrix(filename, directory): @param[in] filename Filename to read results from. @param[in] directory Directory where to read and save data. - """ + """ target_directory = os.path.join(directory, 'heatmaps_' + filename) if not os.path.exists(target_directory): os.makedirs(target_directory) - try: + try: codelist = pd.ExcelFile(os.path.join( directory, filename + '.xlsx'), engine='openpyxl').sheet_names except FileNotFoundError: @@ -1442,11 +1452,11 @@ def plot_interaction_matrix(filename, directory): else: raise gd.DataError('Unknown filename: ' + filename) - ## plot offdiagonal (interactions between NPIs) + # plot offdiagonal (interactions between NPIs) # Set vmin = 1 so that only combinations that are simultaneously active # at least on one day are in color, else use white. # Set vmax = 1e6 to be adjusted with colormap, this value is larger - # than the maximum in all dataframes, this way colors of heatmaps are + # than the maximum in all dataframes, this way colors of heatmaps are # comparable across different visualizations # (e.g. between codes or between joint_codes and exclusions) plt.imshow(array_exclusion, cmap=cmap, @@ -1458,7 +1468,7 @@ def plot_interaction_matrix(filename, directory): code)), dpi=300) plt.close() - ## plot diagonal (interactions between incidence levels of one NPIs) + # plot diagonal (interactions between incidence levels of one NPIs) plt.figure() positions = [i for i in range(len(df.columns)-1)] plt.yticks(positions, df.columns.to_list()[1:]) @@ -1474,7 +1484,6 @@ def plot_interaction_matrix(filename, directory): plt.close() - def main(): """! Main program entry."""
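[Editor's note: the off-diagonal counting in count_code_multiplicities
reduces, per pair of codes, to a set intersection of the dates on which
each code is mentioned. A self-contained illustration with hypothetical
codes and dates, simplified from the per-county accumulation above:

    import pandas as pd

    # dates on which each (hypothetical) subcode is mentioned, as filled
    # into 'code_dates' above; here hard-coded for illustration
    code_dates = {
        'M01a_010': {'d20200310', 'd20200311', 'd20200312'},
        'M01a_020': {'d20200311', 'd20200312', 'd20200320'},
    }
    codes = list(code_dates)
    df_count = pd.DataFrame(0, index=codes, columns=codes)
    for i, code in enumerate(codes):
        for other in codes[:i]:
            # days on which both codes are mentioned simultaneously
            df_count.loc[code, other] = len(
                code_dates[code] & code_dates[other])
    print(df_count)  # the (M01a_020, M01a_010) entry is 2

The heatmaps written by plot_interaction_matrix() visualize exactly such
pairwise counts, whether for raw joint mentions (joint_codes), for codes
after incidence-based activation (joint_codes_incid_depend), or after
strictness-based deactivation (joint_codes_active).]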