Skip to content

Commit

Permalink
create df_local_new directly from df_local_old
Browse files (browse the repository at this point in the history)
  • Loading branch information
patricklnz committed Jan 9, 2023
1 parent 3d3e4e0 commit 309b5c1
Showing 1 changed file with 39 additions and 28 deletions.
67 changes: 39 additions & 28 deletions pycode/memilio-epidata/memilio/epidata/transformNPIData.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,8 @@ def transform_npi_data(fine_resolution=2,
file_format=dd.defaultDict['file_format'],
out_folder=dd.defaultDict['out_folder'],
start_date=dd.defaultDict['start_date'],
end_date=dd.defaultDict['end_date']
end_date=dd.defaultDict['end_date'],
counties_considered=geoger.get_county_ids()
):
"""! Loads a certain resolution of recorded NPI data from
the Corona Datenplattform and transforms it according to the
Expand Down Expand Up @@ -364,13 +365,13 @@ def transform_npi_data(fine_resolution=2,
npi_codes_prior = df_npis_desc['Variablenname']
npi_codes_prior_desc = df_npis_desc['Variable']

# for fine_resolution == 2 deactivation of non-combinable
# for fine_resolution > 0 deactivation of non-combinable
# incidence-dependent NPIs has to be conducted; therefore we defined a
# matrix of possible combinations of NPIs (marked with an X if combinable)
# NPIs of different main category (e.g., M01a and M04) can always be
# combined; only those of, e.g., M01a_010_3 and M01a_080_4 can exclude each
# other
if fine_resolution == 2:
if fine_resolution > 0:
df_npis_combinations_pre = pd.read_excel(
os.path.join(
directory, 'combination_npis.xlsx'), engine = 'openpyxl')
Expand Down Expand Up @@ -568,15 +569,16 @@ def transform_npi_data(fine_resolution=2,
del npi_codes
del npi_desc
# remove rows and columns of unused codes
for code in df_npis_combinations.keys(): # does not work for fine_resolution!=2
local_codes_used_rows = df_npis_combinations[code][1].Code.isin(
npis.NPI_code)
local_codes_used_cols = df_npis_combinations[code][1].columns.isin(
npis.NPI_code)
if fine_resolution > 0:
for code in df_npis_combinations.keys():
local_codes_used_rows = df_npis_combinations[code][1].Code.isin(
npis.NPI_code)
local_codes_used_cols = df_npis_combinations[code][1].columns.isin(
npis.NPI_code)

# overwrite item 0 since codes are stored in *.columns
df_npis_combinations[code] = df_npis_combinations[code][1].loc[local_codes_used_rows,
local_codes_used_cols].reset_index(drop=True).copy()
# overwrite item 0 since codes are stored in *.columns
df_npis_combinations[code] = df_npis_combinations[code][1].loc[local_codes_used_rows,
local_codes_used_cols].reset_index(drop=True).copy()

# prepare grouping of NPIs to reduce product space of
# NPI x active_from_inc (with values "incidence does not matter", and
Expand Down Expand Up @@ -649,18 +651,16 @@ def transform_npi_data(fine_resolution=2,
incidence_thresholds_to_npis[(
incval, '_' + code_considered.split('_')[2])].append(i)

# get county ids
unique_geo_entities = geoger.get_county_ids()
# check if more than the county of Eisenach would be removed with
# current county list
counties_removed = df_npis_old[
~df_npis_old[dd.EngEng['idCounty']].isin(unique_geo_entities)][
~df_npis_old[dd.EngEng['idCounty']].isin(counties_considered)][
dd.EngEng['idCounty']].unique()
if list(counties_removed) != [16056]:
sys.exit('Error. Other counties than that of Eisenach were removed.')
# remove rows for Eisenach
df_npis_old = df_npis_old[df_npis_old[dd.EngEng['idCounty']].isin(
unique_geo_entities)].reset_index(drop=True)
counties_considered)].reset_index(drop=True)

start_npi_cols = list(
df_npis_old.columns).index(
Expand Down Expand Up @@ -730,7 +730,7 @@ def transform_npi_data(fine_resolution=2,
# replace 2,3,4,5 ("mentioned in ...") by 1 ("mentioned")
df_npis_old.replace([-99, 2, 3, 4, 5], [0, 1, 1, 1, 1], inplace=True)

for countyID in unique_geo_entities:
for countyID in counties_considered:
cid = 0
countyidx += 1

Expand Down Expand Up @@ -770,14 +770,25 @@ def transform_npi_data(fine_resolution=2,
cid += 1

start_time = time.perf_counter()
npis_new = df_local_old.iloc[npi_rows, start_npi_cols-1:].set_index(dd.EngEng['npiCode']).transpose().reset_index(drop=True).copy()

# fill in NPI values by transposing from columns to rows
df_local_new[dd.EngEng['date']] = dates_new
# old dataframe has npi codes as rows and date values as columns
# new dataframe should be transposed (dates as rows, npi codes as columns)
df_local_new = df_local_old.iloc[npi_rows, start_npi_cols-1:].set_index(
dd.EngEng['npiCode']).transpose().copy()
# get datetime as a column (previously index after transposing)
df_local_new = df_local_new.reset_index(
drop=False).rename(
columns={'index': dd.EngEng['date']})
# reset index name (which is dd.EngEng['npiCode'] after transposing)
df_local_new.rename_axis('', axis=1, inplace=True)
# change time format from 'dYYYYMMDD' to datetime timestamps
df_local_new[dd.EngEng['date']] = pd.to_datetime(
df_local_new[dd.EngEng['date']], format='d%Y%m%d')
# fill in column for county ID
df_local_new[dd.EngEng['idCounty']] = countyID
# possible resorting of rows such that they are sorted according to
# a literal sorting of the code strings
df_local_new = pd.concat([df_local_new.copy(), npis_new], axis = 1)
# sort columns into the order {Date, ID_County, npi_codes...};
# for now this can be done alphabetically
df_local_new.sort_index(axis=1, inplace=True)

counters[cid] += time.perf_counter()-start_time
cid += 1
Expand Down Expand Up @@ -944,12 +955,12 @@ def transform_npi_data(fine_resolution=2,
# divide working time by completed number of counties and multiply
# by remaining number of counties to estimate time remaining
time_remain = sum(
counters) / countyidx * (len(unique_geo_entities) - countyidx)
counters) / countyidx * (len(counties_considered) - countyidx)
# print progress
if countyidx == 1 or countyidx % int(
len(unique_geo_entities) / 10) == 0:
len(counties_considered) / 10) == 0:
print('Progress ' + str(countyidx) + ' / ' +
str(len(unique_geo_entities)) +
str(len(counties_considered)) +
'. Estimated time remaining: ' +
str(int(time_remain / 60)) + ' min.')

Expand All @@ -973,7 +984,7 @@ def transform_npi_data(fine_resolution=2,
start_date_validation = datetime(2020, 3, 1)
end_date_validation = datetime(2022, 2, 15)

for countyID in unique_geo_entities:
for countyID in counties_considered:
for npiCode in [
'M01a_010', 'M01a_150', 'M05_120', 'M01a_010',
'M18_030', 'M01b_020', 'M02b_035', 'M16_050']:
Expand All @@ -992,7 +1003,7 @@ def transform_npi_data(fine_resolution=2,
start_date_validation = datetime(2020, 3, 1)
end_date_validation = datetime(2022, 2, 15)

for countyID in unique_geo_entities:
for countyID in counties_considered:
for npiCode in [
'M01a_010', 'M01a_150', 'M05_120', 'M01a_010',
'M18_030', 'M01b_020', 'M02b_035', 'M16_050']:
Expand Down Expand Up @@ -1318,4 +1329,4 @@ def main():

if __name__ == "__main__":

main()
main()

0 comments on commit 309b5c1

Please sign in to comment.