def _five_day_bin(day):
    """Map day-of-year values onto the last day of their 5-day window.

    Days 1-5 map to 5, days 6-10 map to 10, and so on up to 365.
    """
    # Ceiling division expressed through floor division so it stays
    # vectorised on a pandas Series (equivalent to 5 * ceil(day / 5)).
    return -(-day // 5) * 5


def FiveDayAgebinAnalyzer(jdir, wdir, sweep_variables, age_groups_aggregates=None,
                          intervention_analyzer_columns=None):
    """
    Run the FiveDayAgebinAnalyzer for OpenMalaria experiments to generate
    and save a results dataframe for defined age groups over time (5-day intervals and years).

    Daily simulation outputs are summed into 5-day windows per sweep variable, age bin and
    year, then pooled into the requested age groups. For each age group the prevalence and
    the annualized clinical and severe incidence are derived, joined with the scenario table
    and the 5-day EIR summary, and written to a CSV file.

    Args:
        jdir (str): Path to the directory containing the scenario data file `scenarios_wseeds.csv`.
        wdir (str): Path to the working directory, where the experiment data (`daily.csv`,
            `EIR_daily.csv`) is stored and the output is saved.
        sweep_variables (list of str): Column names used for grouping and as merge keys.
            They must be present in `daily.csv`, `EIR_daily.csv` and `scenarios_wseeds.csv`.
        age_groups_aggregates (list of lists, optional): Age ranges (in years) to aggregate, in
            the form [[min_age, max_age], ...]. A range selects age bins with
            min_age < agebin <= max_age. Defaults to standard age bins if not provided.
        intervention_analyzer_columns (list, optional): Intervention specific columns that need
            to be included in the final mmmpy_5day.csv file. Defaults to no extra columns.

    Raises:
        FileNotFoundError: If required files (`daily.csv`, `EIR_daily.csv`, or
            `scenarios_wseeds.csv`) are missing.
        ValueError: If expected columns are missing from `daily.csv`, if none of the requested
            age groups matches any age bin, or if a requested output column is not available.

    Saves:
        mmmpy_5day.csv (in `wdir`)

    Returns:
        None
    """
    print("Running FiveDayAgebinAnalyzer...", flush=True)

    days_per_bin = 5      # length of one aggregation window in days
    bins_per_year = 73    # 365 / 5 windows per year, used to annualize incidence

    sweep_variables = list(sweep_variables)
    if not age_groups_aggregates:
        age_groups_aggregates = [[0, 0.5], [0.5, 1], [1, 2], [2, 5], [5, 10], [10, 15], [15, 20],
                                 [20, 100], [0, 5], [0, 100]]
    # None means "no extra columns"; copying also protects the caller's list.
    extra_columns = list(intervention_analyzer_columns or [])

    # --- EIR: mean across runs per timestep, then summarise per 5-day window ---
    eir_df = pd.read_csv(os.path.join(wdir, 'EIR_daily.csv'))
    eir_df = eir_df.groupby(sweep_variables + ['year', 'timestep'])[
        ['eir', 'n_infectious_mos', 'n_total_mos_pop']].agg('mean').reset_index()
    eir_df['day'] = _five_day_bin((eir_df['timestep'] - 1) % 365 + 1)
    eir_df = eir_df.groupby(sweep_variables + ['day', 'year']).agg(
        {'eir': 'sum', 'n_infectious_mos': 'mean',
         'n_total_mos_pop': 'mean', 'timestep': 'max'}).reset_index()

    # --- Daily age-specific output ---
    channels_to_keep = ['index', 'timestep', 'year', 'ageGroup', 'age_upper',
                        'n_inc', 'n_inc_clinical', 'n_inc_severe', 'n_age', 'nPopulation',
                        'prevalence_2to10', 'prev']
    # Make sure every grouping column is actually loaded, without listing a column twice.
    usecols = channels_to_keep + [col for col in sweep_variables if col not in channels_to_keep]
    df = pd.read_csv(os.path.join(wdir, 'daily.csv'), usecols=usecols)

    df['agebin'] = (df['age_upper'] / 365).round(1)  # upper age bound in years
    df['n_prev'] = df['prev'] * df['n_age']          # number of positive hosts
    df['day'] = _five_day_bin((df['timestep'] - 1) % 365 + 1)
    df = df.groupby(sweep_variables + ['agebin', 'day', 'year']).agg(
        {'n_prev': 'sum',
         'n_inc': 'sum',
         'n_inc_clinical': 'sum',
         'n_inc_severe': 'sum',
         'n_age': 'sum',
         'prevalence_2to10': 'mean'}).reset_index()

    df_pfpr2to10 = df.groupby(sweep_variables + ['day', 'year'])[
        ['prevalence_2to10']].agg('mean').reset_index()

    count_channels = ['n_prev', 'n_inc', 'n_inc_clinical', 'n_inc_severe', 'n_age']
    # Explicit copy so the in-place rescaling below never touches a view of `df`.
    df = df[sweep_variables + ['agebin', 'day', 'year'] + count_channels].copy()
    # Host counts were summed over the days of a window; convert back to a per-day average.
    df['n_age'] = df['n_age'] / days_per_bin
    df['n_prev'] = df['n_prev'] / days_per_bin

    # --- Pool age bins into the requested age groups ---
    age_frames = []
    for ages in age_groups_aggregates:
        lower, upper = ages[0], ages[1]
        adf = df[(df['agebin'] > lower) & (df['agebin'] <= upper)]
        if adf.empty:
            continue
        adf = adf.groupby(sweep_variables + ['day', 'year'])[count_channels].agg('sum').reset_index()
        adf['prevalence'] = adf['n_prev'] / adf['n_age']
        # Events per person per annum: 5-daily counts scaled to a full year.
        adf['clinical_incidence'] = adf['n_inc_clinical'] / (adf['n_age'] / bins_per_year)
        adf['severe_incidence'] = adf['n_inc_severe'] / (adf['n_age'] / bins_per_year)
        adf['ageGroup'] = f'{lower}-{upper}'
        age_frames.append(adf)

    if not age_frames:
        raise ValueError('None of the requested age groups matches an age bin in daily.csv: '
                         f'{age_groups_aggregates}')
    # Concatenate once instead of growing a frame inside the loop.
    cdf = pd.concat(age_frames)

    cdf = pd.merge(left=cdf, right=df_pfpr2to10, on=sweep_variables + ['day', 'year'])
    scen_df = pd.read_csv(os.path.join(jdir, 'scenarios_wseeds.csv'))
    scen_df = scen_df.drop(columns=['entomology_mode'], errors='ignore')  # remove EMOD specific columns
    cdf = cdf.merge(scen_df, on=sweep_variables, how='inner')
    cdf = cdf.merge(eir_df, on=sweep_variables + ['day', 'year'], how='inner')

    # Severe incidence recalculation: scale by (0.5 * cm_severe + (1 - cm_severe)).
    cdf['severe_incidence'] = cdf['severe_incidence'] * (0.5 * cdf['cm_severe'] + (1 - cdf['cm_severe']))

    # Rename columns for alignment with OpenMalaria results
    cdf['mortality'] = ''
    cdf = cdf.rename({'n_age': 'nHost'}, axis=1)
    cdf = cdf.drop(columns=['n_prev', 'n_inc', 'n_inc_clinical', 'n_inc_severe'])

    print(f'\nSaving outputs to: {wdir}')
    base_columns = ['scen_id', 'seed', 'timestep', 'cm_clinical', 'seasonality', 'target_output_values',
                    'ageGroup', 'eir', 'prevalence_2to10', 'prevalence',
                    'clinical_incidence', 'severe_incidence', 'n_total_mos_pop', 'n_infectious_mos']
    # Order-preserving de-duplication, in case an extra column repeats a base column.
    columns_to_keep = list(dict.fromkeys(base_columns + extra_columns))
    missing_columns = [col for col in columns_to_keep if col not in cdf.columns]
    if missing_columns:
        raise ValueError(f'Columns missing from the analyzer results: {missing_columns}')

    # Save the processed DataFrame to a CSV file
    cdf = cdf[columns_to_keep]
    cdf.to_csv(os.path.join(wdir, 'mmmpy_5day.csv'), index=False)