def _five_day_bin(timesteps):
    """Map a Series of 1-based daily timesteps to the closing day of their 5-day bin.

    The timestep is first wrapped to a day of year (1..365); the result is the
    smallest multiple of 5 that is >= that day (5, 10, ..., 365).
    """
    day_of_year = (timesteps - 1) % 365 + 1
    return day_of_year.apply(lambda day: 5 * math.ceil(day / 5))


def FiveDayAgebinAnalyzer(jdir, wdir, sweep_variables, age_groups_aggregates=None, intervention_analyzer_columns=None):
    """
    Aggregate daily simulation output into 5-day intervals per age group and save it as CSV.

    Reads `daily.csv` and `EIR_daily.csv` from `wdir`, bins the daily timesteps into
    5-day intervals within each year, aggregates the age bins into the requested age
    groups, derives prevalence plus annualized clinical and severe incidence, joins the
    scenario table and the 5-day EIR / mosquito metrics, and writes `mmmpy_5day.csv`.

    Args:
        jdir (str): Directory containing the scenario file `scenarios_wseeds.csv`.
        wdir (str): Working directory containing `daily.csv` and `EIR_daily.csv`;
            the output file is written here as well.
        sweep_variables (list of str): Column names used as grouping / merge keys.
            NOTE: only `index` is loaded from `daily.csv` and retained from
            `EIR_daily.csv`, and the final EIR merge is keyed on `index`, so in
            practice this has to be `['index']`.
        age_groups_aggregates (list of lists, optional): Age ranges in years, given as
            [[min_age, max_age], ...]. A row belongs to a range when
            min_age < agebin <= max_age, where agebin is `age_upper / 365` rounded to
            one decimal. Defaults to a standard set of age bins when empty or None.
        intervention_analyzer_columns (list, optional): Extra (intervention specific)
            columns to carry over into `mmmpy_5day.csv`. Names that are not present
            in the merged data, or that are already part of the output, are skipped.
            Defaults to no extra columns.

    Raises:
        FileNotFoundError: If `daily.csv`, `EIR_daily.csv` or `scenarios_wseeds.csv`
            is missing (raised by `pandas.read_csv`).
        ValueError: If `daily.csv` lacks one of the columns requested via `usecols`.
        KeyError: If another expected column is missing from the inputs, or if no
            row falls into any of the requested age groups.

    Saves:
        mmmpy_5day.csv

    Returns:
        None
    """
    print("Running FiveDayAgebinAnalyzer...", flush=True)
    if not age_groups_aggregates:
        age_groups_aggregates = [[0, 0.5], [0.5, 1], [1, 2], [2, 5], [5, 10], [10, 15], [15, 20], [20, 100], [0, 5],
                                 [0, 100]]
    # The parameter defaults to None; treat that as "no extra columns" instead of
    # failing when the output columns are assembled.
    if intervention_analyzer_columns is None:
        intervention_analyzer_columns = []

    # ---- EIR / mosquito metrics ------------------------------------------------
    eir_df = pd.read_csv(os.path.join(wdir, 'EIR_daily.csv'))
    # Collapse duplicate rows per key and timestep to their mean.
    eir_df = eir_df.groupby(sweep_variables + ['year', 'timestep'])[
        ['eir', 'target_output_values', 'n_infectious_mos', 'n_total_mos_pop']].agg('mean').reset_index()
    # Explicit copy: a new column is added to this subset right below.
    eir_df = eir_df[['index', 'timestep', 'year', 'eir', 'n_infectious_mos', 'n_total_mos_pop']].copy()
    eir_df['5day'] = _five_day_bin(eir_df['timestep'])
    # Per 5-day bin: EIR is summed, mosquito counts are averaged, and the last
    # timestep of the bin is kept.
    eir_df = eir_df.groupby(sweep_variables + ['5day', 'year']).agg(
        {'eir': 'sum', 'n_infectious_mos': 'mean',
         'n_total_mos_pop': 'mean', 'timestep': 'max'}).reset_index()
    eir_df = eir_df.rename(columns={'5day': 'day'})

    # ---- Daily per-age-bin results ---------------------------------------------
    mean_channels = ['prevalence_2to10']
    sum_channels = ['n_inc', 'n_inc_clinical', 'n_inc_severe', 'n_age']
    daily_columns = (['index', 'timestep', 'year', 'ageGroup', 'age_upper', 'nPopulation', 'prev']
                     + sum_channels + mean_channels)
    df = pd.read_csv(os.path.join(wdir, 'daily.csv'), usecols=daily_columns)
    # age_upper / 365 gives the upper bound of the age bin in years.
    df['agebin'] = (df['age_upper'] / 365).round(1)
    # Number of positive hosts, so prevalence can be re-derived after summing bins.
    df['n_prev'] = df['prev'] * df['n_age']
    sum_channels = sum_channels + ['n_prev']
    df['5day'] = _five_day_bin(df['timestep'])
    df = df.groupby(sweep_variables + ['agebin', '5day', 'year']).agg(
        {'n_prev': 'sum',
         'n_inc': 'sum',
         'n_inc_clinical': 'sum',
         'n_inc_severe': 'sum',
         'n_age': 'sum',
         'nPopulation': 'mean',
         'prevalence_2to10': 'mean',
         'prev': 'mean'}).reset_index()
    df = df.rename(columns={'5day': 'day'})

    # prevalence_2to10 is averaged across the age bins of each key / interval.
    df_pfpr2to10 = df.groupby(sweep_variables + ['day', 'year'])[['prevalence_2to10']].agg('mean').reset_index()

    # Explicit copy so the in-place rescaling below does not act on a slice.
    df = df[sweep_variables + ['agebin', 'day', 'year'] + sum_channels].copy()
    # n_age and n_prev were summed over the timesteps of a bin; dividing by
    # 365 / 73 (= 5) presumably converts them back to a per-timestep average.
    steps_per_bin = 365 / 73
    df['n_age'] = df['n_age'] / steps_per_bin
    df['n_prev'] = df['n_prev'] / steps_per_bin

    # ---- Aggregate age bins into the requested age groups ----------------------
    group_frames = []
    for ages in age_groups_aggregates:
        lower, upper = ages[0], ages[1]
        adf = df[(df['agebin'] > lower) & (df['agebin'] <= upper)]
        if adf.empty:
            continue
        adf = adf.groupby(sweep_variables + ['day', 'year'])[sum_channels].agg('sum').reset_index()
        adf['prevalence'] = adf['n_prev'] / adf['n_age']
        # 73 five-day intervals per year: events per interval -> events per person per year.
        adf['clinical_incidence'] = adf['n_inc_clinical'] / (adf['n_age'] / 73)
        adf['severe_incidence'] = adf['n_inc_severe'] / (adf['n_age'] / 73)
        adf['ageGroup'] = f'{str(lower)}-{str(upper)}'
        group_frames.append(adf)
    # Concatenate once; an empty frame is kept as the "nothing matched" result.
    cdf = pd.concat(group_frames) if group_frames else pd.DataFrame()

    # ---- Join prevalence_2to10, scenario metadata and EIR ----------------------
    cdf = pd.merge(left=cdf, right=df_pfpr2to10, on=sweep_variables + ['day', 'year'])
    scen_df = pd.read_csv(os.path.join(jdir, 'scenarios_wseeds.csv'))
    scen_df = scen_df.drop(['entomology_mode'], axis=1, errors='ignore')
    cdf = cdf.merge(scen_df, on=sweep_variables, how='inner')
    cdf = cdf.merge(eir_df, on=['index', 'day', 'year'], how='inner')

    # Severe incidence recalculation: scaled by 0.5 for the cm_severe fraction
    # and left unchanged for the remaining (1 - cm_severe) fraction.
    cdf['severe_incidence'] = cdf['severe_incidence'] * (0.5 * cdf['cm_severe'] + (1 - cdf['cm_severe']))

    # Rename columns for alignment with OpenMalaria results
    cdf = cdf.rename({'n_age': 'nHost'}, axis=1)
    cdf = cdf.drop(columns=['n_prev', 'n_inc', 'n_inc_clinical', 'n_inc_severe'])

    # ---- Select output columns and save ----------------------------------------
    columns_to_keep = ['scen_id', 'seed', 'timestep', 'day', 'year', 'cm_clinical', 'seasonality',
                       'target_output_values',
                       'ageGroup', 'eir', 'prevalence_2to10', 'prevalence',
                       'clinical_incidence', 'severe_incidence', 'n_total_mos_pop', 'n_infectious_mos']
    for column in intervention_analyzer_columns:
        # Skip unknown columns and avoid emitting the same column twice.
        if column in cdf.columns and column not in columns_to_keep:
            columns_to_keep.append(column)
    cdf = cdf[columns_to_keep]

    print(f'\nSaving outputs to: {wdir}')
    cdf.to_csv(os.path.join(wdir, 'mmmpy_5day.csv'), index=False)