| adm0 | adm1 | adm2 | adm3 | orgunitlevel5 | hf | date | month | year | allout_u5 | allout_ov5 | maladm_u5 | maladm_5_14 | maladm_ov15 | maldth_u5 | maldth_5_14 | maldth_ov15 | susp_u5_hf | susp_5_14_hf | susp_ov15_hf | susp_u5_com | susp_5_14_com | susp_ov15_com | maldth_fem_ov15 | maldth_mal_ov15 | maldth_1_59m | maldth_10_14 | maldth_5_9 | tes_neg_rdt_u5_com | tes_pos_rdt_u5_com | tes_neg_rdt_5_14_com | tes_pos_rdt_5_14_com | tes_neg_rdt_ov15_com | tes_pos_rdt_ov15_com | test_neg_mic_u5_hf | test_pos_mic_u5_hf | test_neg_mic_5_14_hf | test_pos_mic_5_14_hf | test_neg_mic_ov15_hf | test_pos_mic_ov15_hf | tes_neg_rdt_u5_hf | tes_pos_rdt_u5_hf | tes_neg_rdt_5_14_hf | tes_pos_rdt_5_14_hf | tes_neg_rdt_ov15_hf | tes_pos_rdt_ov15_hf | maltreat_u24_u5_com | maltreat_ov24_u5_com | maltreat_u24_5_14_com | maltreat_ov24_5_14_com | maltreat_u24_ov15_com | maltreat_ov24_ov15_com | maltreat_u24_u5_hf | maltreat_ov24_u5_hf | maltreat_u24_5_14_hf | maltreat_ov24_5_14_hf | maltreat_u24_ov15_hf | maltreat_ov24_ov15_hf | allout | susp | test_hf | test_com | test | conf_hf | conf_com | conf | maltreat_com | maltreat_hf | maltreat | pres_com | pres_hf | pres | maladm | maldth | test_hf_u5 | test_hf_5_14 | test_hf_ov15 | test_com_u5 | test_com_5_14 | test_com_ov15 | test_u5 | test_5_14 | test_ov15 | susp_hf_u5 | susp_hf_5_14 | susp_hf_ov15 | susp_com_u5 | susp_com_5_14 | susp_com_ov15 | susp_u5 | susp_5_14 | susp_ov15 | conf_hf_u5 | conf_hf_5_14 | conf_hf_ov15 | conf_com_u5 | conf_com_5_14 | conf_com_ov15 | conf_u5 | conf_5_14 | conf_ov15 | maltreat_hf_u5 | maltreat_hf_5_14 | maltreat_hf_ov15 | maltreat_com_u5 | maltreat_com_5_14 | maltreat_com_ov15 | maltreat_u5 | maltreat_5_14 | maltreat_ov15 | maltreat_u24_hf | maltreat_ov24_hf | maltreat_u24_com | maltreat_ov24_com | maltreat_u24_total | maltreat_ov24_total | pres_com_u5 | pres_com_5_14 | pres_com_ov15 | pres_hf_u5 | pres_hf_5_14 | pres_hf_ov15 | pres_u5 | pres_5_14 | pres_ov15 | hf_uid | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-01 | 01 | 2021 | 44 | 48 | nan | nan | nan | nan | nan | nan | 45 | 30 | 45 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9 | 36 | 8 | 22 | 9 | 36 | nan | nan | nan | nan | nan | nan | 6 | 30 | nan | 20 | 4 | 31 | 92 | 120 | 120 | nan | 120 | 94 | nan | 94 | nan | 91 | 91 | nan | 0 | 0 | nan | nan | 45 | 30 | 45 | nan | nan | nan | 45 | 30 | 45 | 45 | 30 | 45 | nan | nan | nan | 45 | 30 | 45 | 36 | 22 | 36 | nan | nan | nan | 36 | 22 | 36 | 36 | 20 | 35 | nan | nan | nan | 36 | 20 | 35 | 10 | 81 | nan | nan | 10 | 81 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 1 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-02 | 02 | 2021 | 37 | 27 | nan | nan | nan | nan | nan | nan | 28 | 40 | 24 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 8 | 20 | 10 | 30 | 8 | 16 | nan | nan | nan | nan | nan | nan | 3 | 17 | 25 | 5 | nan | 16 | 64 | 92 | 92 | nan | 92 | 66 | nan | 66 | nan | 66 | 66 | nan | 0 | 0 | nan | nan | 28 | 40 | 24 | nan | nan | nan | 28 | 40 | 24 | 28 | 40 | 24 | nan | nan | nan | 28 | 40 | 24 | 20 | 30 | 16 | nan | nan | nan | 20 | 30 | 16 | 20 | 30 | 16 | nan | nan | nan | 20 | 30 | 16 | 28 | 38 | nan | nan | 28 | 38 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 2 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-03 | 03 | 2021 | 44 | 42 | nan | nan | nan | nan | nan | nan | 21 | 32 | 201 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 10 | nan | 15 | nan | 100 | nan | nan | 18 | nan | 22 | nan | 186 | nan | nan | nan | nan | nan | nan | 8 | nan | 7 | nan | 86 | nan | 86 | 254 | 351 | nan | 351 | 226 | nan | 226 | nan | 101 | 101 | nan | 0 | 0 | nan | nan | 28 | 37 | 286 | nan | nan | nan | 28 | 37 | 286 | 21 | 32 | 201 | nan | nan | nan | 21 | 32 | 201 | 18 | 22 | 186 | nan | nan | nan | 18 | 22 | 186 | 8 | 7 | 86 | nan | nan | nan | 8 | 7 | 86 | 101 | nan | nan | nan | 101 | nan | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 3 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-04 | 04 | 2021 | 28 | 23 | nan | nan | nan | nan | nan | nan | 37 | 40 | 42 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 7 | 30 | 10 | 30 | 12 | 30 | nan | nan | nan | nan | nan | nan | 10 | 20 | 30 | nan | 20 | 10 | 51 | 119 | 119 | nan | 119 | 90 | nan | 90 | nan | 90 | 90 | nan | 0 | 0 | nan | nan | 37 | 40 | 42 | nan | nan | nan | 37 | 40 | 42 | 37 | 40 | 42 | nan | nan | nan | 37 | 40 | 42 | 30 | 30 | 30 | nan | nan | nan | 30 | 30 | 30 | 30 | 30 | 30 | nan | nan | nan | 30 | 30 | 30 | 60 | 30 | nan | nan | 60 | 30 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 4 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-05 | 05 | 2021 | 138 | 141 | nan | nan | nan | nan | nan | nan | 100 | 63 | 36 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 28 | 72 | 23 | 40 | 1 | 35 | nan | nan | nan | nan | nan | nan | 10 | 35 | 15 | 25 | 10 | 25 | 279 | 199 | 199 | nan | 199 | 147 | nan | 147 | nan | 120 | 120 | nan | 0 | 0 | nan | nan | 100 | 63 | 36 | nan | nan | nan | 100 | 63 | 36 | 100 | 63 | 36 | nan | nan | nan | 100 | 63 | 36 | 72 | 40 | 35 | nan | nan | nan | 72 | 40 | 35 | 45 | 40 | 35 | nan | nan | nan | 45 | 40 | 35 | 35 | 85 | nan | nan | 35 | 85 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 5 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-06 | 06 | 2021 | 127 | 82 | nan | nan | nan | nan | nan | nan | 91 | 19 | 75 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 32 | 59 | 3 | 16 | 24 | 51 | nan | nan | nan | nan | nan | nan | nan | 36 | nan | 16 | nan | 34 | 209 | 185 | 185 | nan | 185 | 126 | nan | 126 | nan | 86 | 86 | nan | 0 | 0 | nan | nan | 91 | 19 | 75 | nan | nan | nan | 91 | 19 | 75 | 91 | 19 | 75 | nan | nan | nan | 91 | 19 | 75 | 59 | 16 | 51 | nan | nan | nan | 59 | 16 | 51 | 36 | 16 | 34 | nan | nan | nan | 36 | 16 | 34 | nan | 86 | nan | nan | nan | 86 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 6 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-07 | 07 | 2021 | 130 | 175 | nan | nan | nan | nan | nan | nan | 114 | 32 | 118 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 28 | 86 | 8 | 24 | 22 | 96 | nan | nan | nan | nan | nan | nan | nan | 86 | nan | 23 | nan | 96 | 305 | 264 | 264 | nan | 264 | 206 | nan | 206 | nan | 205 | 205 | nan | 0 | 0 | nan | nan | 114 | 32 | 118 | nan | nan | nan | 114 | 32 | 118 | 114 | 32 | 118 | nan | nan | nan | 114 | 32 | 118 | 86 | 24 | 96 | nan | nan | nan | 86 | 24 | 96 | 86 | 23 | 96 | nan | nan | nan | 86 | 23 | 96 | nan | 205 | nan | nan | nan | 205 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 7 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-08 | 08 | 2021 | 144 | 203 | nan | nan | nan | nan | nan | nan | 125 | 22 | 129 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 32 | 93 | 5 | 17 | 31 | 98 | nan | nan | nan | nan | nan | nan | 9 | 84 | 1 | 16 | 18 | 80 | 347 | 276 | 276 | nan | 276 | 208 | nan | 208 | nan | 208 | 208 | nan | 0 | 0 | nan | nan | 125 | 22 | 129 | nan | nan | nan | 125 | 22 | 129 | 125 | 22 | 129 | nan | nan | nan | 125 | 22 | 129 | 93 | 17 | 98 | nan | nan | nan | 93 | 17 | 98 | 93 | 17 | 98 | nan | nan | nan | 93 | 17 | 98 | 28 | 180 | nan | nan | 28 | 180 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 8 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-09 | 09 | 2021 | 159 | 159 | nan | nan | nan | nan | nan | nan | 135 | 28 | 102 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 57 | 78 | 10 | 18 | 32 | 70 | nan | nan | nan | nan | nan | nan | nan | 78 | nan | 18 | nan | 70 | 318 | 265 | 265 | nan | 265 | 166 | nan | 166 | nan | 166 | 166 | nan | 0 | 0 | nan | nan | 135 | 28 | 102 | nan | nan | nan | 135 | 28 | 102 | 135 | 28 | 102 | nan | nan | nan | 135 | 28 | 102 | 78 | 18 | 70 | nan | nan | nan | 78 | 18 | 70 | 78 | 18 | 70 | nan | nan | nan | 78 | 18 | 70 | nan | 166 | nan | nan | nan | 166 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
| 9 | Sierra Leone | Bo District | Bo City Council | Bo City | Aethel CHP | Aethel CHP | 2021-10 | 10 | 2021 | 88 | 128 | nan | nan | nan | nan | nan | nan | 9 | 92 | 100 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 3 | 6 | 12 | 80 | 27 | 73 | nan | nan | nan | nan | nan | nan | nan | 6 | 31 | 49 | 13 | 60 | 216 | 201 | 201 | nan | 201 | 159 | nan | 159 | nan | 159 | 159 | nan | 0 | 0 | nan | nan | 9 | 92 | 100 | nan | nan | nan | 9 | 92 | 100 | 9 | 92 | 100 | nan | nan | nan | 9 | 92 | 100 | 6 | 80 | 73 | nan | nan | nan | 6 | 80 | 73 | 6 | 80 | 73 | nan | nan | nan | 6 | 80 | 73 | 44 | 115 | nan | nan | 44 | 115 | nan | nan | nan | 0 | 0 | 0 | 0 | 0 | 0 | hf_0001 |
DHIS2 data preprocessing
Overview
DHIS2 is a core data source for routine malaria surveillance, capturing indicators such as outpatient case counts, diagnostic testing, and treatment provision. However, raw DHIS2 exports may not be ready for SNT analysis if they have structural or quality issues, are messy, include inconsistent or unreadable column names, missing values, mismatched facility names, duplicate records, and formats that vary across months or locations. To make this data usable for SNT, it needs to be cleaned, reshaped, and aligned with a spatial and temporal structure that supports later steps of analysis: identifying active and inactive health facilities, calculating reporting rates, managing outliers, calculating malaria incidence, etc.
In this section, we walk through how to preprocess monthly DHIS2 malaria records to make them SNT-ready, using example data from Sierra Leone. This includes cleaning and standardising facility-level data, aggregating to administrative levels where needed, to produce structured datasets.
The DHIS2 data you are working with may require some or all of the specific steps detailed below, or additional steps may be required to prepare the data for analysis.
- Import and/or combine DHIS2 data files
- Select and rename key variables
- Compute new variables at the facility level (e.g. presumed cases)
- Set up any UIDs needed
- Troubleshoot potential issues along the way
- Aggregate data monthly and yearly at the selected administrative level
- Export cleaned and aggregated datasets
Key considerations
Troubleshooting
Routine surveillance data can be trickier to work with than other data sources that are more standardized. Expect to encounter and resolve data issues throughout the DHIS2 data preprocessing process. You will likely need to:
- Investigate unexpected patterns in your data
- Make judgment calls about how to handle inconsistencies
- Collaborate with your SNT team on country-specific issues
- Develop custom solutions for problems not covered here
The troubleshooting approaches shown in this guide provide a starting framework, but real-world data cleaning often requires iterative problem-solving and local expertise. While this guide addresses challenges others have encountered when preparing DHIS2 data for analysis, every country is different. Your specific context may present unique problems that are not yet covered in the code library. We encourage you to be persistent and creative if needed and to consult others for help if you are unable to resolve on your own.
If you encounter a new challenge when preprocessing DHIS2 data and would like new content to be added to the code library, please fill out our feedback form.
Best practices when working with routine data
1. Understand the System’s Context
- Reporting workflows: Know how data moves from clinics to DHIS2 (paper vs. direct entry, aggregation rules).
- Indicator definitions: Confirm case definitions and inclusions
2. Triangulate Strategically
Cross-check with:
- External benchmarks (e.g., HMIS reports, survey data)
- Parallel systems (e.g., stock management data for testing kits)
- Temporal patterns (compare dry vs. rainy season trends)
3. Document Your Process
Maintain a troubleshooting log tracking:
- Issues encountered
- Data sources consulted
- Decisions made (with rationale)
Timing lags: Facilities may report late, especially after holidays or system outages.
What a clean and well-structured dataset should look like
Your data might come in a different format to the Sierra Leone example below. Whether you are able to use the chunk of code provided, or parts of it, to import your data, please make sure your data, after import, contains the following information:
- A column with the date (year and month) of the report
- A column with the health facility name
- Various columns with parent admin units - at minimum you will need the name of the admin unit for SNT analysis (for example, chiefdom for Sierra Leone)
- Your data indicators columns
Below is an example of what the data should look like after import:
VT: Do we want to select specific indicator columns to avoid showing too many NA values?
Step-by-Step
In this section, we walk through the key preprocessing steps needed to get DHIS2 malaria data ready for SNT analysis, using example DHIS2 data from Sierra Leone. The focus here is on cleaning, reshaping, and aggregating facility-level data to produce structured outputs at both the facility and administrative levels.
Each step is designed to guide you through the process cleanly. Make sure to follow the notes in the code, especially where edits are required (e.g. column names or admin levels). The goal is to make the data analysis-ready without breaking linkages to spatial units or reporting structures.
Remember that you will need to adapt this process to the specific country context in which you are working.
To skip the step-by-step explanation, jump to the full code at the end of this page.
Step 1: Import packages and data
We’ll use Sierra Leone DHIS2 malaria data as a working example. Variable names or database structures may differ across countries. Be sure to review and update the variable names to match your country’s DHIS2 configuration.
We begin by installing and loading the required packages for our preprocessing, then import the data.
If you are using python and need help installing packages, please refer to the Getting Started section.
Step 1.1: Import packages
# Install pacman only if it's not already installed
if (!requireNamespace("pacman", quietly = TRUE)) {
install.packages("pacman")
}
# install or load relevant packages
pacman::p_load(
tidyverse, # core tidy tools (dplyr, tidyr, etc.)
rio, # import and export and file format
DT, # view interactive tables
here # for easy file referencing
)

Step 1.2: Import data
The code chunk below defines a helper function to import different types of data, for example Excel or csv data. If your DHIS2 extraction resulted in multiple files, this function will merge them together into a single dataset. Then we use it to read in our data file(s).
When calling the DHIS2 data import function, we read files from a folder using the here package. The here package builds paths from the project root, keeping code portable and reliable.
Before running any code, take a moment to visually inspect your data. This is especially important when working with DHIS2 data or other routine health sources. As explained in the Data Structures of the Getting Started: For Analysts page, your data should follow tidy data principles: each column represents a variable, each row an observation, and each cell contains a single value.
To check this, open the Excel or CSV file directly and review:
- Headers: Are the column names in the first row? Are they clear and consistent?
- Columns and rows:
- Each column should represent a variable (e.g.
health_facility,month,confirmed_cases). - Each row should represent a single observation (e.g. one facility-month).
- Each column should represent a variable (e.g.
- Cells: Each cell should contain a single value (not merged or grouped). Avoid cells that contain multiple values (e.g.
10/5for tested/positive). - Extra rows: Remove any top rows used for titles, notes, or merged headers.
If your dataset is already tidy, you’re ready to move on. If not, try to clean it up before working with it. You can do this:
In Excel, by editing a copy of the file and removing any extra formatting, then saving your new copy to import and work with in later steps.
Or in R, using packages like:
rio::import(file, skip = X)to skip X number of extra header rowstidyr::pivot_longer()to reshape wide data into long formatdplyr::mutate()andseparate()to split combined values into separate columns
Or in Python, using:
pandas.read_excel(file, skiprows=X)to skip X number of unwanted header rowspandas.melt()to reshape wide data into long formatpandas.Series.str.split()andpandas.assign()to split and create new columns
For more complex formatting issues, like multi-row or merged headers, see this excellent step-by-step guide: Tidying Multi-Header Excel Data with R by Paul Campbell (2019).
Getting this right early will save time and reduce confusion later in the workflow, as all the subsequent steps assume that the data is already in tidy format and ready for analysis.
Show the code
#' Reads and combines multiple Excel or CSV files from a directory into a
#' single dataframe.
#'
#' This function searches for files with the specified extension in the given
#' directory and row-binds them into a single dataframe. It handles both
#' Excel (.xls, .xlsx) and CSV files (and other formats supported by rio).
#'
#' @param directory Path to directory containing files
#' @param pattern File extension (default: "xls"). Options: "xls", "xlsx",
#'   "csv" and other formats available in the rio package.
#' @return A tibble combining all readable files, with whitespace trimmed
#'   from column names. Files that fail to import are skipped with a warning.
#'   Errors if no files with the given extension are found.
#' @examples
#' # Read Excel files
#' df1 <- read_and_combine_files("path/to/excel", "xlsx")
#' # Read CSV files
#' df2 <- read_and_combine_files("path/to/csv", "csv")
#'
read_and_combine_files <- function(
  directory,
  pattern = "xls"
) {
  # Anchor the extension at the end of the filename so that, e.g.,
  # pattern = "xls" does not match unrelated substrings mid-name.
  pattern <- paste0("\\.", pattern, "$")

  # Find files matching the pattern
  file_paths <- list.files(
    path = directory,
    pattern = pattern,
    full.names = TRUE
  )

  # Fail fast if there is nothing to read
  if (length(file_paths) == 0) {
    stop("No files found with specified extension in the directory.")
  }

  # Read all files based on extension. A file that fails to import is
  # skipped, but with an explicit warning so the analyst can see the gap
  # (previously failures were silently converted to NULL).
  data_files <- lapply(file_paths, function(file) {
    tryCatch(
      rio::import(file),
      error = function(e) {
        warning(
          "Failed to import ", file, ": ", conditionMessage(e),
          call. = FALSE
        )
        NULL # dropped by bind_rows() below
      }
    )
  })

  # Combine all data frames (NULL entries from failed reads are ignored)
  combined_df <- dplyr::bind_rows(data_files) |>
    dplyr::as_tibble()

  # Trim whitespace from column names
  names(combined_df) <- trimws(names(combined_df))

  combined_df
}
# Read in and combine all raw DHIS2 extract files.
# here::here() builds the path from the project root, so this works
# regardless of the current working directory; adjust the path components
# to the folder that holds your own raw extracts.
dhis2_df <- read_and_combine_files(
  directory = here::here(
    "1.1.2_epidemiology",
    "1.1.2a_routine_surveillance",
    "raw"
  ),
  pattern = "xls" # change to "xlsx" or "csv" to match your file type
)
head(dhis2_df)

To adapt the code:
- Line 58: Change
directory = "1.1.2_epidemiology..."to your current working directory - Line 63: Modify the file extension pattern according to your file type:
- For
.xlsfiles:pattern = "xls" - For
.xlsxfiles:pattern = "xlsx - For
.csvfiles:pattern = "csv"
- For
Show the code
# Set up the directory containing the raw DHIS2 extract(s);
# here() builds the path from the project root.
directory = Path(here("english/data_r/routine_cases/dhis2_data"))
extension = 'xls'

# Find all files with specified extension in directory
files = list(directory.glob(f'*.{extension}'))

# Read every file first, then concatenate once at the end: calling
# pd.concat inside the loop re-copies the accumulated dataframe on each
# iteration (quadratic cost as the number of files grows).
frames = [pd.read_excel(file, sheet_name = 'Sheet1') for file in files]
dhis2_df = pd.concat(frames) if frames else pd.DataFrame()
dhis2_df.head(10).style

To adapt the code:
- Line 2: Change
directory = "1.1.2_epidemiology..."to your current working directory - Line 3: Modify the file extension pattern according to your file type:
- For
.xlsfiles:pattern = "xls" - For
.xlsxfiles:pattern = "xlsx - For
.csvfiles:pattern = "csv"
- For
Step 2: First pass troubleshooting
Step 2.1: Basic cleaning
Basic cleaning should be performed based on initial inspection of the imported data. Cleaning at this stage covers best practices of preprocessing.
HERE I WANT TO PUT EXTREMELY UNCONTROVERSIAL THINGS THAT WOULD NOT NEED CONSULTATION WITH THE SNT TEAM, LIKE THE 2 ITEMS IN VAL’S EXAMPLE. WE COULD EVEN STASH THE YEAR-MONTH THING HERE.
Notes: include a diagnostic where you print number of NA per variable. some variables (admin unit name, periodname for ex) really ought not be NA and if they appear, they ought to be investigated with the DHIS2 expert / SNT team focal person for routine data
# --- Step 2.1: basic cleaning -----------------------------------------------

# Remove leading/trailing whitespace from column names
dhis2_df <- dhis2_df |>
  dplyr::rename_with(~ stringr::str_trim(.x))

# Remove duplicate rows and reset index
# NOTE(review): the mutate()/select() pair below adds a row-number column
# and then immediately drops it, so it has no net effect on the data
# (R data frames carry no index to reset) -- candidate for removal.
dhis2_df <- dhis2_df |>
  dplyr::distinct() |>
  dplyr::mutate(index = dplyr::row_number(), .before = 1) |>
  dplyr::select(-index)

# Diagnostic: NA count by column
na_counts <- base::colSums(base::is.na(dhis2_df))
na_counts <- na_counts[na_counts > 0] # Show only variables with NAs

if (base::length(na_counts) > 0) {
  message("Variables with missing values:")
  print(na_counts)
  # Check critical variables: these identifiers should never be missing;
  # if they are, investigate with the DHIS2 expert / SNT focal person.
  critical_vars <- c("adm0", "adm1", "adm2", "hf", "periodname")
  critical_na <- na_counts[base::names(na_counts) %in% critical_vars]
  if (base::length(critical_na) > 0) {
    message("\n❌ CRITICAL ISSUE: Missing values in:")
    print(critical_na)
    message("Contact DHIS2 expert/SNT team immediately")
  }
} else {
  message("✅ No missing values found in any variables")
}Step 2.2: (If needed) Troubleshoot initial import (maybe multiple steps here)
Check for problems and handle stuff like:
- Inconsistent columns across files
- Missing admin unit names
- Name matching of HFs across multiple extracts
Step 3: Rename columns
Review variable definitions for your country to better understand and appropriately analyze the data. Consider investigating questions like:
- Are admissions included in outpatients?
- Are data on pregnant women included in the data for adults?
- Is there double counting between RDT and microscopy results? If yes, is it appropriate to only use RDT results?
- Are data from the private sector included here? If so, what percentage of the private sector reports into DHIS2?
- Are data from community health workers included in their assigned health facility data or are data separate?
- Have any variables been included or adapted throughout the years? If so, how should they be treated throughout the time series?
Always confirm variable definitions with the SNT team.
Step 3.1: Assign renaming rules
The rename_columns() function (for R) or renaming dictionaries (for python) standardises the column names in the Sierra Leone DHIS2 malaria dataset. If you’re working with your own country’s data, you only need to adjust the column names to match your DHIS2 data dictionary. For example, if your dataset uses “Admission of child with malaria - 5-14yrs” instead of “Admission - Child with malaria 5-14 years_X”, simply update the relevant names in rename_columns() (for R), d_cols_admin and/or d_cols_data (for python) accordingly.
Show the code
#' Rename columns in a dataframe according to a predefined mapping
#'
#' Standardises Sierra Leone DHIS2 export column names (organisational units
#' and malaria indicators) to short snake_case analysis names. Adapt the
#' mapping below to your own country's DHIS2 data dictionary.
#'
#' @param df A dataframe containing columns to be renamed
#'
#' @return The dataframe with renamed columns.
#'   NOTE(review): dplyr::rename() errors if a mapped old column name is
#'   absent from `df`; it does not return NULL on error.
#' @examples
#' # The old (DHIS2 export) names must be present in the input, e.g.:
#' # df <- rename_columns(raw_dhis2_export)
rename_columns <- function(df) {
  # Mapping: new_name = "old DHIS2 export name"
  rename_cols <- c(
    # Organisational units
    "adm0" = "orgunitlevel1",
    "adm1" = "orgunitlevel2",
    "adm2" = "orgunitlevel3",
    "adm3" = "orgunitlevel4",
    "hf" = "organisationunitname",
    # All outpatient visits
    "allout_u5" = "OPD (New and follow-up curative) 0-59m_X",
    "allout_ov5" = "OPD (New and follow-up curative) 5+y_X",
    # Malaria admissions
    "maladm_u5" = "Admission - Child with malaria 0-59 months_X",
    "maladm_5_14" = "Admission - Child with malaria 5-14 years_X",
    "maladm_ov15" = "Admission - Malaria 15+ years_X",
    # Malaria deaths
    "maldth_1_59m" = "Child death - Malaria 1-59m_X",
    "maldth_10_14" = "Child death - Malaria 10-14y_X",
    "maldth_5_9" = "Child death - Malaria 5-9y_X",
    "maldth_fem_ov15" = "Death malaria 15+ years Female",
    "maldth_mal_ov15" = "Death malaria 15+ years Male",
    "maldth_u5" = "Separation - Child with malaria 0-59 months_X Death",
    "maldth_5_14" = "Separation - Child with malaria 5-14 years_X Death",
    "maldth_ov15" = "Separation - Malaria 15+ years_X Death",
    # Suspected cases
    "susp_u5_hf" = "Fever case - suspected Malaria 0-59m_X",
    "susp_5_14_hf" = "Fever case - suspected Malaria 5-14y_X",
    "susp_ov15_hf" = "Fever case - suspected Malaria 15+y_X",
    "susp_u5_com" = "Fever case in community (Suspected Malaria) 0-59m_X",
    "susp_5_14_com" = "Fever case in community (Suspected Malaria) 5-14y_X",
    "susp_ov15_com" = "Fever case in community (Suspected Malaria) 15+y_X",
    # Malaria tests conducted at community level
    "tes_neg_rdt_u5_com" =
      "Fever case in community tested for Malaria (RDT) - Negative 0-59m_X",
    "tes_pos_rdt_u5_com" =
      "Fever case in community tested for Malaria (RDT) - Positive 0-59m_X",
    "tes_neg_rdt_5_14_com" =
      "Fever case in community tested for Malaria (RDT) - Negative 5-14y_X",
    "tes_pos_rdt_5_14_com" =
      "Fever case in community tested for Malaria (RDT) - Positive 5-14y_X",
    "tes_neg_rdt_ov15_com" =
      "Fever case in community tested for Malaria (RDT) - Negative 15+y_X",
    "tes_pos_rdt_ov15_com" =
      "Fever case in community tested for Malaria (RDT) - Positive 15+y_X",
    # Malaria tests conducted at health facility level
    "test_neg_mic_u5_hf" =
      "Fever case tested for Malaria (Microscopy) - Negative 0-59m_X",
    "test_pos_mic_u5_hf" =
      "Fever case tested for Malaria (Microscopy) - Positive 0-59m_X",
    "test_neg_mic_5_14_hf" =
      "Fever case tested for Malaria (Microscopy) - Negative 5-14y_X",
    "test_pos_mic_5_14_hf" =
      "Fever case tested for Malaria (Microscopy) - Positive 5-14y_X",
    "test_neg_mic_ov15_hf" =
      "Fever case tested for Malaria (Microscopy) - Negative 15+y_X",
    "test_pos_mic_ov15_hf" =
      "Fever case tested for Malaria (Microscopy) - Positive 15+y_X",
    "tes_neg_rdt_u5_hf" =
      "Fever case tested for Malaria (RDT) - Negative 0-59m_X",
    "tes_pos_rdt_u5_hf" =
      "Fever case tested for Malaria (RDT) - Positive 0-59m_X",
    "tes_neg_rdt_5_14_hf" =
      "Fever case tested for Malaria (RDT) - Negative 5-14y_X",
    "tes_pos_rdt_5_14_hf" =
      "Fever case tested for Malaria (RDT) - Positive 5-14y_X",
    "tes_neg_rdt_ov15_hf" =
      "Fever case tested for Malaria (RDT) - Negative 15+y_X",
    "tes_pos_rdt_ov15_hf" =
      "Fever case tested for Malaria (RDT) - Positive 15+y_X",
    # Malaria treatment provided at community level
    "maltreat_u24_u5_com" =
      "Malaria treated in community with ACT <24 hours 0-59m_X",
    "maltreat_ov24_u5_com" =
      "Malaria treated in community with ACT >24 hours 0-59m_X",
    "maltreat_u24_5_14_com" =
      "Malaria treated in community with ACT <24 hours 5-14y_X",
    "maltreat_ov24_5_14_com" =
      "Malaria treated in community with ACT >24 hours 5-14y_X",
    "maltreat_u24_ov15_com" =
      "Malaria treated in community with ACT <24 hours 15+y_X",
    "maltreat_ov24_ov15_com" =
      "Malaria treated in community with ACT >24 hours 15+y_X",
    # Malaria treatment provided at health facility level
    "maltreat_u24_u5_hf" =
      "Malaria treated with ACT <24 hours 0-59m_X",
    "maltreat_ov24_u5_hf" =
      "Malaria treated with ACT >24 hours 0-59m_X",
    "maltreat_u24_5_14_hf" =
      "Malaria treated with ACT <24 hours 5-14y_X",
    "maltreat_ov24_5_14_hf" =
      "Malaria treated with ACT >24 hours 5-14y_X",
    "maltreat_u24_ov15_hf" =
      "Malaria treated with ACT <24 hours 15+y_X",
    "maltreat_ov24_ov15_hf" =
      "Malaria treated with ACT >24 hours 15+y_X"
  )
  # Splice the named vector into rename(): names become the new columns
  df <- df |>
    dplyr::rename(!!!rename_cols)
  return(df)
}To adapt the code:
- Lines 15–114: Modify the column names (add, edit, remove) to match those specific to your country’s data dictionary and its SNT needs
- Once updated, run the code
Show the code
# Here we define standardised column names, in relation to the names we have in the dataset. We will then use this dictionary to rename columns.
d_cols_admin = {
# Organisational units
'adm0' : 'orgunitlevel1',
'adm1' : 'orgunitlevel2',
'adm2' : 'orgunitlevel3',
'adm3' : 'orgunitlevel4',
'hf' : 'organisationunitname',
'YM': 'periodname'}
d_cols_data = {
# All outpatient visits
'allout_u5' : 'OPD (New and follow-up curative) 0-59m_X',
'allout_ov5' : 'OPD (New and follow-up curative) 5+y_X',
# Malaria admissions
'maladm_u5' : 'Admission - Child with malaria 0-59 months_X',
'maladm_5_14' : 'Admission - Child with malaria 5-14 years_X',
'maladm_ov15' : 'Admission - Malaria 15+ years_X',
# Malaria deaths
'maldth_1_59m' : 'Child death - Malaria 1-59m_X',
'maldth_10_14' : 'Child death - Malaria 10-14y_X',
'maldth_5_9' : 'Child death - Malaria 5-9y_X',
'maldth_fem_ov15' : 'Death malaria 15+ years Female',
'maldth_mal_ov15' : 'Death malaria 15+ years Male',
'maldth_u5' : 'Separation - Child with malaria 0-59 months_X Death',
'maldth_5_14' : 'Separation - Child with malaria 5-14 years_X Death',
'maldth_ov15' : 'Separation - Malaria 15+ years_X Death',
# Suspected cases
'susp_hf_u5' : 'Fever case - suspected Malaria 0-59m_X',
'susp_hf_5_14' : 'Fever case - suspected Malaria 5-14y_X',
'susp_hf_ov15' : 'Fever case - suspected Malaria 15+y_X',
'susp_com_u5' : 'Fever case in community (Suspected Malaria) 0-59m_X',
'susp_com_5_14' : 'Fever case in community (Suspected Malaria) 5-14y_X',
'susp_com_ov15' : 'Fever case in community (Suspected Malaria) 15+y_X',
# Malaria tests conducted at community level
'test_neg_rdt_u5_com' : 'Fever case in community tested for Malaria (RDT) - Negative 0-59m_X',
'test_pos_rdt_u5_com' : 'Fever case in community tested for Malaria (RDT) - Positive 0-59m_X',
'test_neg_rdt_5_14_com' : 'Fever case in community tested for Malaria (RDT) - Negative 5-14y_X',
'test_pos_rdt_5_14_com' : 'Fever case in community tested for Malaria (RDT) - Positive 5-14y_X',
'test_neg_rdt_ov15_com' : 'Fever case in community tested for Malaria (RDT) - Negative 15+y_X',
'test_pos_rdt_ov15_com' : 'Fever case in community tested for Malaria (RDT) - Positive 15+y_X',
# Malaria tests conducted at health facility level
'test_neg_mic_u5_hf' : 'Fever case tested for Malaria (Microscopy) - Negative 0-59m_X',
'test_pos_mic_u5_hf' : 'Fever case tested for Malaria (Microscopy) - Positive 0-59m_X',
'test_neg_mic_5_14_hf' : 'Fever case tested for Malaria (Microscopy) - Negative 5-14y_X',
'test_pos_mic_5_14_hf' : 'Fever case tested for Malaria (Microscopy) - Positive 5-14y_X',
'test_neg_mic_ov15_hf' : 'Fever case tested for Malaria (Microscopy) - Negative 15+y_X',
'test_pos_mic_ov15_hf' : 'Fever case tested for Malaria (Microscopy) - Positive 15+y_X',
'test_neg_rdt_u5_hf' : 'Fever case tested for Malaria (RDT) - Negative 0-59m_X',
'test_pos_rdt_u5_hf' : 'Fever case tested for Malaria (RDT) - Positive 0-59m_X',
'test_neg_rdt_5_14_hf' : 'Fever case tested for Malaria (RDT) - Negative 5-14y_X',
'test_pos_rdt_5_14_hf' : 'Fever case tested for Malaria (RDT) - Positive 5-14y_X',
'test_neg_rdt_ov15_hf' : 'Fever case tested for Malaria (RDT) - Negative 15+y_X',
'test_pos_rdt_ov15_hf' : 'Fever case tested for Malaria (RDT) - Positive 15+y_X',
# Malaria treatment with ACT at community level
'maltreat_u24_u5_com' : 'Malaria treated in community with ACT <24 hours 0-59m_X',
'maltreat_ov24_u5_com' : 'Malaria treated in community with ACT >24 hours 0-59m_X',
'maltreat_u24_5_14_com' : 'Malaria treated in community with ACT <24 hours 5-14y_X',
'maltreat_ov24_5_14_com' : 'Malaria treated in community with ACT >24 hours 5-14y_X',
'maltreat_u24_ov15_com' : 'Malaria treated in community with ACT <24 hours 15+y_X',
'maltreat_ov24_ov15_com' : 'Malaria treated in community with ACT >24 hours 15+y_X',
# Malaria treatment with ACT at health facility level
'maltreat_u24_u5_hf' : 'Malaria treated with ACT <24 hours 0-59m_X',
'maltreat_ov24_u5_hf' : 'Malaria treated with ACT >24 hours 0-59m_X',
'maltreat_u24_5_14_hf' : 'Malaria treated with ACT <24 hours 5-14y_X',
'maltreat_ov24_5_14_hf' : 'Malaria treated with ACT >24 hours 5-14y_X',
'maltreat_u24_ov15_hf' : 'Malaria treated with ACT <24 hours 15+y_X',
'maltreat_ov24_ov15_hf' : 'Malaria treated with ACT >24 hours 15+y_X'
}To adapt the code:
- Lines XXX: Modify the column names (add, edit, remove) to match those specific to your country’s data dictionary and its SNT needs
- Once updated, run the code
Step 3.2: Apply renaming
Now that the renaming rules are defined, we rename columns.
To adapt the code:
- Line 16: Change
"english/data_r"to your current working directory - Once updated, run the code
Show the code
# Rename columns using the admin and data-element dictionaries.
# Each dictionary maps new_name -> old_name, so the mapping is inverted
# before being handed to DataFrame.rename (which expects old -> new).
for mapping in (d_cols_admin, d_cols_data):
    rename_map = {old: new for new, old in mapping.items()}
    dhis2_df = dhis2_df.rename(columns=rename_map)

# Remove unnecessary columns
dhis2_df = dhis2_df.drop(columns=['orgunitlevel5'])
dhis2_df.head(10).style| adm0 | adm1 | adm2 | adm3 | hf | YM | allout_u5 | allout_ov5 | maladm_u5 | maladm_5_14 | maladm_ov15 | maldth_u5 | maldth_5_14 | maldth_ov15 | susp_hf_u5 | susp_hf_5_14 | susp_hf_ov15 | susp_com_u5 | susp_com_5_14 | susp_com_ov15 | maldth_fem_ov15 | maldth_mal_ov15 | maldth_1_59m | maldth_10_14 | maldth_5_9 | test_neg_rdt_u5_com | test_pos_rdt_u5_com | test_neg_rdt_5_14_com | test_pos_rdt_5_14_com | test_neg_rdt_ov15_com | test_pos_rdt_ov15_com | test_neg_mic_u5_hf | test_pos_mic_u5_hf | test_neg_mic_5_14_hf | test_pos_mic_5_14_hf | test_neg_mic_ov15_hf | test_pos_mic_ov15_hf | test_neg_rdt_u5_hf | test_pos_rdt_u5_hf | test_neg_rdt_5_14_hf | test_pos_rdt_5_14_hf | test_neg_rdt_ov15_hf | test_pos_rdt_ov15_hf | maltreat_u24_u5_com | maltreat_ov24_u5_com | maltreat_u24_5_14_com | maltreat_ov24_5_14_com | maltreat_u24_ov15_com | maltreat_ov24_ov15_com | maltreat_u24_u5_hf | maltreat_ov24_u5_hf | maltreat_u24_5_14_hf | maltreat_ov24_5_14_hf | maltreat_u24_ov15_hf | maltreat_ov24_ov15_hf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | January 2015 | 44.000000 | 43.000000 | nan | nan | nan | nan | nan | nan | 21.000000 | 4.000000 | 4.000000 | 10.000000 | 7.000000 | 4.000000 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 21.000000 | nan | 4.000000 | nan | 4.000000 | 15.000000 | 6.000000 | 5.000000 | 2.000000 | 3.000000 | 1.000000 | 18.000000 | 3.000000 | 4.000000 | nan | 2.000000 | nan |
| 1 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | February 2015 | 49.000000 | 34.000000 | nan | nan | nan | nan | nan | nan | 37.000000 | 3.000000 | 4.000000 | 10.000000 | 7.000000 | 4.000000 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 10.000000 | 27.000000 | nan | 3.000000 | nan | 4.000000 | 15.000000 | 6.000000 | 5.000000 | 2.000000 | 3.000000 | 1.000000 | 17.000000 | 4.000000 | 2.000000 | nan | 2.000000 | 1.000000 |
| 2 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | March 2015 | 102.000000 | 83.000000 | nan | nan | nan | nan | nan | nan | 46.000000 | 5.000000 | 14.000000 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 36.000000 | nan | 3.000000 | nan | 8.000000 | nan | nan | nan | nan | nan | nan | 36.000000 | nan | 3.000000 | nan | 5.000000 | nan |
| 3 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | April 2015 | 68.000000 | 68.000000 | nan | nan | nan | nan | nan | nan | 44.000000 | 10.000000 | 10.000000 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 9.000000 | 35.000000 | 4.000000 | 6.000000 | 5.000000 | 5.000000 | nan | nan | nan | nan | nan | nan | 27.000000 | 8.000000 | 4.000000 | 2.000000 | 3.000000 | 2.000000 |
| 4 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | May 2015 | 118.000000 | 76.000000 | nan | nan | nan | nan | nan | nan | 105.000000 | 15.000000 | 22.000000 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 20.000000 | 85.000000 | 4.000000 | 11.000000 | 5.000000 | 17.000000 | nan | nan | nan | nan | nan | nan | 67.000000 | 18.000000 | 7.000000 | 4.000000 | 10.000000 | 7.000000 |
| 5 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | June 2015 | 145.000000 | 89.000000 | nan | nan | nan | nan | nan | nan | 110.000000 | 10.000000 | 21.000000 | 10.000000 | 11.000000 | 9.000000 | nan | nan | nan | nan | nan | 2.000000 | 8.000000 | 1.000000 | 10.000000 | 1.000000 | 8.000000 | nan | nan | nan | nan | nan | nan | 24.000000 | 86.000000 | 6.000000 | 4.000000 | 10.000000 | 11.000000 | 13.000000 | 4.000000 | 9.000000 | 1.000000 | 6.000000 | 1.000000 | 70.000000 | 16.000000 | 3.000000 | 1.000000 | 6.000000 | 5.000000 |
| 6 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | July 2015 | 95.000000 | 94.000000 | nan | nan | nan | nan | nan | nan | 67.000000 | 31.000000 | 26.000000 | 10.000000 | 13.000000 | 9.000000 | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | nan | 17.000000 | 50.000000 | 9.000000 | 22.000000 | 8.000000 | 18.000000 | 20.000000 | nan | 10.000000 | nan | 8.000000 | nan | 43.000000 | 6.000000 | 13.000000 | 5.000000 | 10.000000 | 5.000000 |
| 7 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | August 2015 | 86.000000 | 63.000000 | nan | nan | nan | nan | nan | nan | 72.000000 | 6.000000 | 15.000000 | 15.000000 | 17.000000 | 14.000000 | nan | nan | nan | nan | nan | 2.000000 | 13.000000 | 3.000000 | 14.000000 | 2.000000 | 12.000000 | nan | nan | nan | nan | nan | nan | 8.000000 | 64.000000 | nan | 4.000000 | nan | 15.000000 | 20.000000 | 4.000000 | 11.000000 | 3.000000 | 6.000000 | 6.000000 | 52.000000 | 12.000000 | 3.000000 | 1.000000 | 11.000000 | 4.000000 |
| 8 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | September 2015 | 117.000000 | 94.000000 | nan | nan | nan | nan | nan | nan | 66.000000 | 15.000000 | 24.000000 | 16.000000 | 19.000000 | 16.000000 | nan | nan | nan | nan | nan | 2.000000 | 14.000000 | 3.000000 | 16.000000 | 2.000000 | 14.000000 | nan | nan | nan | nan | nan | nan | 16.000000 | 50.000000 | 9.000000 | 6.000000 | 2.000000 | 22.000000 | 22.000000 | 4.000000 | 13.000000 | 3.000000 | 6.000000 | 3.000000 | 42.000000 | 8.000000 | 3.000000 | 3.000000 | 5.000000 | 7.000000 |
| 9 | Sierra Leone | Moyamba District | Moyamba District Council | Kori Chiefdom | Bai Largo MCHP | October 2015 | 97.000000 | 68.000000 | nan | nan | nan | nan | nan | nan | 84.000000 | 5.000000 | 11.000000 | 16.000000 | 5.000000 | 10.000000 | nan | nan | nan | nan | nan | 5.000000 | 11.000000 | 4.000000 | 1.000000 | 4.000000 | 6.000000 | nan | nan | nan | nan | nan | nan | 11.000000 | 73.000000 | 2.000000 | 3.000000 | 3.000000 | 8.000000 | 23.000000 | 3.000000 | 1.000000 | nan | 4.000000 | nan | 66.000000 | 7.000000 | 2.000000 | 1.000000 | 5.000000 | 3.000000 |
Step 4: (If needed) Compute variables
Step 4.1: Compute indicator totals and new variables
Now that we have imported our DHIS2 data and cleaned up the columns, we will compute derived variables to create totals for specific indicators. DHIS2 can provide totals, but these columns must specifically be extracted. In the raw DHIS2 dataset, indicators are typically disaggregated by age group, community, and health facility level. Analysts should strive to obtain the most disaggregated databases possible.
For example, if we want to calculate the total number of outpatient visits, DHIS2 may not provide a direct total, but includes components that can be summed, such as allout_u5 and allout_ov5. Remember to review variable definitions to ensure your aggregations do not double count cases.
The same logic applies to other indicators, including total suspected cases (susp), tested cases (test), confirmed cases (conf), treated cases (maltreat), and presumed cases (pres). The code snippet below shows how to compute these totals by combining the relevant components. The code also includes indicator aggregation by age groups, such as test_hf_u5 which captures all children under five tested by either RDT or microscopy at a health facility.
It is best to verify with the SNT team which specific data elements from DHIS2 should be summed to obtain the correct total. While some calculations may seem obvious, others are not. The box in Step 1.2 lists example questions that the analyst should get verification for before summing across disaggregated data elements. Depending on your country’s DHIS2 and data reporting practice, other questions may also be relevant.
While some countries have presumed cases in their DHIS2 data, not all do! Here are three options for presumed cases that countries have used:
- Option 1: Your data already has a
prescolumn, no further calculation is needed. - Option 2: Calculate presumed cases using the difference between treated cases and confirmed cases:
maltreat-conf - Option 3: Calculate presumed cases using the difference between suspected cases and tested cases:
susp-test
The code below uses Option 2 (lines 52 to 55). Testing is subject to resource availability meaning facilities handle presumed cases differently.
Consult the SNT team to determine how best to approach presumed cases calculations, whether it’s one of the options shown above or a different approach.
Below we create two new functions: fallback_row_sum() and fallback_diff(). The fallback_row_sum() function replaces rowSums(..., na.rm = TRUE), which returns 0 when all values in a row are NA. This default behavior can obscure missingness.
In routine health data, it’s critical to distinguish between:
- True zeros: e.g. a facility reported 0 cases
- Missing values: e.g. the facility did not report at all
fallback_row_sum() returns NA when too few values are present, preserving this distinction and preventing overestimation of completeness.
Similarly, fallback_diff() returns the difference (col1 − col2) if both values are present, the non-missing value if only one is present, and NA if both are missing. It applies pmax() to floor results at a minimum threshold (so, for example, presumed cases can never be negative).
Both functions guard against misleading outputs when data are incomplete.
Show the code
# Row-wise sum that preserves missingness: a row's total is NA unless at
# least `min_present` of its components are observed. This replaces
# rowSums(..., na.rm = TRUE), which would silently report 0 for rows where
# every component is missing.
# (`.keep_zero_as_zero` is currently unused — kept for interface stability.)
fallback_row_sum <- function(..., min_present = 1, .keep_zero_as_zero = TRUE) {
  value_mat <- cbind(...)
  n_observed <- rowSums(!is.na(value_mat))
  totals <- rowSums(value_mat, na.rm = TRUE)
  # Mask rows with too few observed components back to NA
  totals[n_observed < min_present] <- NA_real_
  totals
}
# Floored difference between two vectors with graceful missing-data handling:
#   * both present -> max(col1 - col2, minimum)
#   * one present  -> that value, floored at `minimum`
#   * both missing -> NA
fallback_diff <- function(col1, col2, minimum = 0) {
  dplyr::case_when(
    !is.na(col1) & !is.na(col2) ~ pmax(col1 - col2, minimum),
    !is.na(col1)                ~ pmax(col1, minimum),
    !is.na(col2)                ~ pmax(col2, minimum),
    TRUE                        ~ NA_real_
  )
}
# compute indicator totals in DHIS2 data
# NOTE(review): components below use the raw DHIS2 element names (age group
# before the hf/com suffix, e.g. susp_u5_hf, tes_pos_rdt_u5_hf); the
# harmonised *_hf_* / *_com_* names are created further down in this same
# mutate() call, so later assignments may reference earlier ones.
dhis2_df <- dhis2_df |>
  dplyr::mutate(
    # outpatient visits (all ages)
    allout = fallback_row_sum(allout_u5, allout_ov5),
    # suspected cases (facility + community, all ages)
    susp = fallback_row_sum(
      susp_u5_hf, susp_5_14_hf, susp_ov15_hf,
      susp_u5_com, susp_5_14_com, susp_ov15_com
    ),
    # tested cases at facility: microscopy + RDT, negatives + positives
    test_hf = fallback_row_sum(
      test_neg_mic_u5_hf, test_pos_mic_u5_hf,
      test_neg_mic_5_14_hf, test_pos_mic_5_14_hf,
      test_neg_mic_ov15_hf, test_pos_mic_ov15_hf,
      tes_neg_rdt_u5_hf, tes_pos_rdt_u5_hf,
      tes_neg_rdt_5_14_hf, tes_pos_rdt_5_14_hf,
      tes_neg_rdt_ov15_hf, tes_pos_rdt_ov15_hf
    ),
    # tested cases in community: RDT only
    test_com = fallback_row_sum(
      tes_neg_rdt_u5_com, tes_pos_rdt_u5_com,
      tes_neg_rdt_5_14_com, tes_pos_rdt_5_14_com,
      tes_neg_rdt_ov15_com, tes_pos_rdt_ov15_com
    ),
    test = fallback_row_sum(test_hf, test_com),
    # confirmed cases = positive tests (HF and COM)
    conf_hf = fallback_row_sum(
      test_pos_mic_u5_hf, test_pos_mic_5_14_hf, test_pos_mic_ov15_hf,
      tes_pos_rdt_u5_hf, tes_pos_rdt_5_14_hf, tes_pos_rdt_ov15_hf
    ),
    conf_com = fallback_row_sum(
      tes_pos_rdt_u5_com, tes_pos_rdt_5_14_com, tes_pos_rdt_ov15_com
    ),
    conf = fallback_row_sum(conf_hf, conf_com),
    # treated cases (ACT, within/after 24 hours)
    maltreat_com = fallback_row_sum(
      maltreat_u24_u5_com, maltreat_ov24_u5_com,
      maltreat_u24_5_14_com, maltreat_ov24_5_14_com,
      maltreat_u24_ov15_com, maltreat_ov24_ov15_com
    ),
    maltreat_hf = fallback_row_sum(
      maltreat_u24_u5_hf, maltreat_ov24_u5_hf,
      maltreat_u24_5_14_hf, maltreat_ov24_5_14_hf,
      maltreat_u24_ov15_hf, maltreat_ov24_ov15_hf
    ),
    maltreat = fallback_row_sum(maltreat_hf, maltreat_com),
    # presumed cases (Option 2: treated minus confirmed, floored at zero)
    pres_com = fallback_diff(maltreat_com, conf_com),
    pres_hf = fallback_diff(maltreat_hf, conf_hf),
    pres = fallback_row_sum(pres_com, pres_hf),
    # malaria admissions
    maladm = fallback_row_sum(
      maladm_u5, maladm_5_14, maladm_ov15
    ),
    # malaria deaths
    # NOTE(review): several of these bands appear to overlap (maldth_u5 vs
    # maldth_1_59m; maldth_5_14 vs maldth_5_9/maldth_10_14; maldth_ov15 vs
    # maldth_fem_ov15/maldth_mal_ov15) — confirm with the SNT team that
    # summing all of them does not double count deaths.
    maldth = fallback_row_sum(
      maldth_u5, maldth_1_59m, maldth_10_14, maldth_5_9,
      maldth_5_14, maldth_ov15, maldth_fem_ov15, maldth_mal_ov15
    ),
    # AGE-GROUP SPECIFIC AGGREGATIONS
    # Tested cases by age group (HF only)
    test_hf_u5 = fallback_row_sum(
      test_neg_mic_u5_hf, test_pos_mic_u5_hf,
      tes_neg_rdt_u5_hf, tes_pos_rdt_u5_hf
    ),
    test_hf_5_14 = fallback_row_sum(
      test_neg_mic_5_14_hf, test_pos_mic_5_14_hf,
      tes_neg_rdt_5_14_hf, tes_pos_rdt_5_14_hf
    ),
    test_hf_ov15 = fallback_row_sum(
      test_neg_mic_ov15_hf, test_pos_mic_ov15_hf,
      tes_neg_rdt_ov15_hf, tes_pos_rdt_ov15_hf
    ),
    # Tested cases by age group (Community only)
    test_com_u5 = fallback_row_sum(
      tes_neg_rdt_u5_com, tes_pos_rdt_u5_com
    ),
    test_com_5_14 = fallback_row_sum(
      tes_neg_rdt_5_14_com, tes_pos_rdt_5_14_com
    ),
    test_com_ov15 = fallback_row_sum(
      tes_neg_rdt_ov15_com, tes_pos_rdt_ov15_com
    ),
    # Total tested by age group (HF + Community)
    test_u5 = fallback_row_sum(test_hf_u5, test_com_u5),
    test_5_14 = fallback_row_sum(test_hf_5_14, test_com_5_14),
    test_ov15 = fallback_row_sum(test_hf_ov15, test_com_ov15),
    # Suspected cases by age group (HF only) — plain renames from the raw
    # DHIS2 element names to the harmonised *_hf_* scheme
    susp_hf_u5 = susp_u5_hf,
    susp_hf_5_14 = susp_5_14_hf,
    susp_hf_ov15 = susp_ov15_hf,
    # Suspected cases by age group (Community only)
    susp_com_u5 = susp_u5_com,
    susp_com_5_14 = susp_5_14_com,
    susp_com_ov15 = susp_ov15_com,
    # Total suspected by age group (HF + Community)
    susp_u5 = fallback_row_sum(susp_hf_u5, susp_com_u5),
    susp_5_14 = fallback_row_sum(susp_hf_5_14, susp_com_5_14),
    susp_ov15 = fallback_row_sum(susp_hf_ov15, susp_com_ov15),
    # Confirmed cases by age group (HF only)
    conf_hf_u5 = fallback_row_sum(
      test_pos_mic_u5_hf, tes_pos_rdt_u5_hf
    ),
    conf_hf_5_14 = fallback_row_sum(
      test_pos_mic_5_14_hf, tes_pos_rdt_5_14_hf
    ),
    conf_hf_ov15 = fallback_row_sum(
      test_pos_mic_ov15_hf, tes_pos_rdt_ov15_hf
    ),
    # Confirmed cases by age group (Community only; RDT is the only
    # community test, so these are plain renames)
    conf_com_u5 = tes_pos_rdt_u5_com,
    conf_com_5_14 = tes_pos_rdt_5_14_com,
    conf_com_ov15 = tes_pos_rdt_ov15_com,
    # Total confirmed by age group (HF + Community)
    conf_u5 = fallback_row_sum(conf_hf_u5, conf_com_u5),
    conf_5_14 = fallback_row_sum(conf_hf_5_14, conf_com_5_14),
    conf_ov15 = fallback_row_sum(conf_hf_ov15, conf_com_ov15),
    # Treated cases by age group (HF only)
    maltreat_hf_u5 = fallback_row_sum(
      maltreat_u24_u5_hf, maltreat_ov24_u5_hf
    ),
    maltreat_hf_5_14 = fallback_row_sum(
      maltreat_u24_5_14_hf, maltreat_ov24_5_14_hf
    ),
    maltreat_hf_ov15 = fallback_row_sum(
      maltreat_u24_ov15_hf, maltreat_ov24_ov15_hf
    ),
    # Treated cases by age group (Community only)
    maltreat_com_u5 = fallback_row_sum(
      maltreat_u24_u5_com, maltreat_ov24_u5_com
    ),
    maltreat_com_5_14 = fallback_row_sum(
      maltreat_u24_5_14_com, maltreat_ov24_5_14_com
    ),
    maltreat_com_ov15 = fallback_row_sum(
      maltreat_u24_ov15_com, maltreat_ov24_ov15_com
    ),
    # Total treated by age group (HF + Community)
    maltreat_u5 = fallback_row_sum(maltreat_hf_u5, maltreat_com_u5),
    maltreat_5_14 = fallback_row_sum(maltreat_hf_5_14, maltreat_com_5_14),
    maltreat_ov15 = fallback_row_sum(maltreat_hf_ov15, maltreat_com_ov15),
    # Total treated within 24 hours (HF only)
    maltreat_u24_hf = fallback_row_sum(
      maltreat_u24_u5_hf, maltreat_u24_5_14_hf, maltreat_u24_ov15_hf
    ),
    # Total treated after 24 hours (HF only)
    maltreat_ov24_hf = fallback_row_sum(
      maltreat_ov24_u5_hf, maltreat_ov24_5_14_hf, maltreat_ov24_ov15_hf
    ),
    # Total treated within 24 hours (Community only)
    maltreat_u24_com = fallback_row_sum(
      maltreat_u24_u5_com, maltreat_u24_5_14_com, maltreat_u24_ov15_com
    ),
    # Total treated after 24 hours (Community only)
    maltreat_ov24_com = fallback_row_sum(
      maltreat_ov24_u5_com, maltreat_ov24_5_14_com, maltreat_ov24_ov15_com
    ),
    # Overall totals (HF + Community)
    maltreat_u24_total = fallback_row_sum(maltreat_u24_hf, maltreat_u24_com),
    maltreat_ov24_total = fallback_row_sum(maltreat_ov24_hf, maltreat_ov24_com),
    # Presumed cases by age group (treated minus confirmed, floored at 0)
    pres_com_u5 = fallback_diff(maltreat_com_u5, conf_com_u5),
    pres_com_5_14 = fallback_diff(maltreat_com_5_14, conf_com_5_14),
    pres_com_ov15 = fallback_diff(maltreat_com_ov15, conf_com_ov15),
    pres_hf_u5 = fallback_diff(maltreat_hf_u5, conf_hf_u5),
    pres_hf_5_14 = fallback_diff(maltreat_hf_5_14, conf_hf_5_14),
    pres_hf_ov15 = fallback_diff(maltreat_hf_ov15, conf_hf_ov15),
    # Total presumed by age group (HF + Community)
    pres_u5 = fallback_row_sum(pres_com_u5, pres_hf_u5),
    pres_5_14 = fallback_row_sum(pres_com_5_14, pres_hf_5_14),
    pres_ov15 = fallback_row_sum(pres_com_ov15, pres_hf_ov15)
)To adapt the code: - Lines 4–64: Adjust the variable names to reflect those relevant to your dataset when calculating new variables - Once updated, run the code
Show the code
# Aggregation map: {new total column: component columns to sum row-wise}.
# Dict insertion order matters — derived totals ('test', 'conf', 'maltreat',
# 'maltreat_*_total') reference keys created earlier by the same loop.
# Columns here use the post-rename names (e.g. susp_hf_u5, test_neg_rdt_*).
d_sums = {
    # outpatient visits
    'allout': ['allout_u5', 'allout_ov5'],
    # suspected cases
    'susp': ['susp_hf_u5', 'susp_hf_5_14', 'susp_hf_ov15',
             'susp_com_u5', 'susp_com_5_14', 'susp_com_ov15'],
    # tested cases (facility: microscopy + RDT, negatives + positives)
    'test_hf': ['test_neg_mic_u5_hf', 'test_pos_mic_u5_hf',
                'test_neg_mic_5_14_hf', 'test_pos_mic_5_14_hf',
                'test_neg_mic_ov15_hf', 'test_pos_mic_ov15_hf',
                'test_neg_rdt_u5_hf', 'test_pos_rdt_u5_hf',
                'test_neg_rdt_5_14_hf', 'test_pos_rdt_5_14_hf',
                'test_neg_rdt_ov15_hf', 'test_pos_rdt_ov15_hf'],
    # tested cases (community: RDT only)
    'test_com': ['test_neg_rdt_u5_com', 'test_pos_rdt_u5_com',
                 'test_neg_rdt_5_14_com', 'test_pos_rdt_5_14_com',
                 'test_neg_rdt_ov15_com', 'test_pos_rdt_ov15_com'],
    'test': ['test_hf', 'test_com'],
    # confirmed cases (HF and COM) = positive tests
    'conf_hf': ['test_pos_mic_u5_hf', 'test_pos_mic_5_14_hf',
                'test_pos_mic_ov15_hf', 'test_pos_rdt_u5_hf',
                'test_pos_rdt_5_14_hf', 'test_pos_rdt_ov15_hf'],
    'conf_com': ['test_pos_rdt_u5_com', 'test_pos_rdt_5_14_com',
                 'test_pos_rdt_ov15_com'],
    'conf': ['conf_hf', 'conf_com'],
    # treated cases (ACT, within/after 24 hours)
    'maltreat_com': ['maltreat_u24_u5_com', 'maltreat_ov24_u5_com',
                     'maltreat_u24_5_14_com', 'maltreat_ov24_5_14_com',
                     'maltreat_u24_ov15_com', 'maltreat_ov24_ov15_com'],
    'maltreat_hf': ['maltreat_u24_u5_hf', 'maltreat_ov24_u5_hf',
                    'maltreat_u24_5_14_hf', 'maltreat_ov24_5_14_hf',
                    'maltreat_u24_ov15_hf', 'maltreat_ov24_ov15_hf'],
    'maltreat': ['maltreat_hf', 'maltreat_com'],
    # malaria admissions
    'maladm': ['maladm_u5', 'maladm_5_14', 'maladm_ov15'],
    # malaria deaths
    # NOTE(review): several bands appear to overlap (maldth_u5 vs
    # maldth_1_59m; maldth_5_14 vs maldth_5_9/maldth_10_14; maldth_ov15 vs
    # maldth_fem_ov15/maldth_mal_ov15) — confirm with the SNT team that
    # summing all of them does not double count deaths.
    'maldth': ['maldth_u5', 'maldth_1_59m', 'maldth_10_14', 'maldth_5_9',
               'maldth_5_14', 'maldth_ov15', 'maldth_fem_ov15',
               'maldth_mal_ov15'],
    # AGE-GROUP SPECIFIC AGGREGATIONS
    # Tested cases by age group (HF only)
    'test_hf_u5': ['test_neg_mic_u5_hf', 'test_pos_mic_u5_hf',
                   'test_neg_rdt_u5_hf', 'test_pos_rdt_u5_hf'],
    'test_hf_5_14': ['test_neg_mic_5_14_hf', 'test_pos_mic_5_14_hf',
                     'test_neg_rdt_5_14_hf', 'test_pos_rdt_5_14_hf'],
    'test_hf_ov15': ['test_neg_mic_ov15_hf', 'test_pos_mic_ov15_hf',
                     'test_neg_rdt_ov15_hf', 'test_pos_rdt_ov15_hf'],
    # Tested cases by age group (Community only)
    'test_com_u5': ['test_neg_rdt_u5_com', 'test_pos_rdt_u5_com'],
    'test_com_5_14': ['test_neg_rdt_5_14_com', 'test_pos_rdt_5_14_com'],
    'test_com_ov15': ['test_neg_rdt_ov15_com', 'test_pos_rdt_ov15_com'],
    # Total tested by age group (HF + Community)
    'test_u5': ['test_hf_u5', 'test_com_u5'],
    'test_5_14': ['test_hf_5_14', 'test_com_5_14'],
    'test_ov15': ['test_hf_ov15', 'test_com_ov15'],
    # Suspected cases by age group already exist after the rename step;
    # Total suspected by age group (HF + Community)
    'susp_u5': ['susp_hf_u5', 'susp_com_u5'],
    'susp_5_14': ['susp_hf_5_14', 'susp_com_5_14'],
    'susp_ov15': ['susp_hf_ov15', 'susp_com_ov15'],
    # Confirmed cases by age group (HF only)
    'conf_hf_u5': ['test_pos_mic_u5_hf', 'test_pos_rdt_u5_hf'],
    'conf_hf_5_14': ['test_pos_mic_5_14_hf', 'test_pos_rdt_5_14_hf'],
    'conf_hf_ov15': ['test_pos_mic_ov15_hf', 'test_pos_rdt_ov15_hf'],
    # Confirmed cases by age group (Community only; single component, so
    # this is effectively a copy that still honours min_count)
    'conf_com_u5': ['test_pos_rdt_u5_com'],
    'conf_com_5_14': ['test_pos_rdt_5_14_com'],
    'conf_com_ov15': ['test_pos_rdt_ov15_com'],
    # Total confirmed by age group (HF + Community)
    'conf_u5': ['conf_hf_u5', 'conf_com_u5'],
    'conf_5_14': ['conf_hf_5_14', 'conf_com_5_14'],
    'conf_ov15': ['conf_hf_ov15', 'conf_com_ov15'],
    # Treated cases by age group (HF only)
    'maltreat_hf_u5': ['maltreat_u24_u5_hf', 'maltreat_ov24_u5_hf'],
    'maltreat_hf_5_14': ['maltreat_u24_5_14_hf', 'maltreat_ov24_5_14_hf'],
    'maltreat_hf_ov15': ['maltreat_u24_ov15_hf', 'maltreat_ov24_ov15_hf'],
    # Treated cases by age group (Community only)
    'maltreat_com_u5': ['maltreat_u24_u5_com', 'maltreat_ov24_u5_com'],
    'maltreat_com_5_14': ['maltreat_u24_5_14_com', 'maltreat_ov24_5_14_com'],
    'maltreat_com_ov15': ['maltreat_u24_ov15_com', 'maltreat_ov24_ov15_com'],
    # Total treated by age group (HF + Community)
    'maltreat_u5': ['maltreat_hf_u5', 'maltreat_com_u5'],
    'maltreat_5_14': ['maltreat_hf_5_14', 'maltreat_com_5_14'],
    'maltreat_ov15': ['maltreat_hf_ov15', 'maltreat_com_ov15'],
    # Total treated within 24 hours (HF only)
    'maltreat_u24_hf': ['maltreat_u24_u5_hf', 'maltreat_u24_5_14_hf',
                        'maltreat_u24_ov15_hf'],
    # Total treated after 24 hours (HF only)
    'maltreat_ov24_hf': ['maltreat_ov24_u5_hf', 'maltreat_ov24_5_14_hf',
                         'maltreat_ov24_ov15_hf'],
    # Total treated within 24 hours (Community only)
    'maltreat_u24_com': ['maltreat_u24_u5_com', 'maltreat_u24_5_14_com',
                         'maltreat_u24_ov15_com'],
    # Total treated after 24 hours (Community only)
    'maltreat_ov24_com': ['maltreat_ov24_u5_com', 'maltreat_ov24_5_14_com',
                          'maltreat_ov24_ov15_com'],
    # Overall totals (HF + Community)
    'maltreat_u24_total': ['maltreat_u24_hf', 'maltreat_u24_com'],
    'maltreat_ov24_total': ['maltreat_ov24_hf', 'maltreat_ov24_com'],
}
# Build each aggregate column as the row-wise sum of its components.
# min_count=1 keeps the total as NaN when every component is missing,
# distinguishing "not reported" from a true zero (mirrors fallback_row_sum).
for total_name, component_cols in d_sums.items():
    dhis2_df[total_name] = dhis2_df[component_cols].sum(axis=1, skipna=True, min_count=1)
# Subtractions
# Presumed cases (Option 2): pres = maltreat - conf.
# The order of the two columns in each list is important: the second column
# is subtracted from the first, i.e. d = {'pres': ['maltreat', 'conf']}.
d_pres = {
    'pres_com': ['maltreat_com', 'conf_com'],
    'pres_hf': ['maltreat_hf', 'conf_hf'],
    # Presumed cases by age group
    'pres_com_u5': ['maltreat_com_u5', 'conf_com_u5'],
    'pres_com_5_14': ['maltreat_com_5_14', 'conf_com_5_14'],
    'pres_com_ov15': ['maltreat_com_ov15', 'conf_com_ov15'],
    'pres_hf_u5': ['maltreat_hf_u5', 'conf_hf_u5'],
    'pres_hf_5_14': ['maltreat_hf_5_14', 'conf_hf_5_14'],
    'pres_hf_ov15': ['maltreat_hf_ov15', 'conf_hf_ov15'],
}
for col_pres, cols in d_pres.items():
    # Difference between the two columns in the list.
    # NOTE(review): unlike the R fallback_diff(), .sub() yields NaN when
    # either operand is missing (R falls back to the non-missing value) —
    # confirm which behaviour is intended so R and Python outputs agree.
    dhis2_df[col_pres] = dhis2_df[cols[0]].sub(dhis2_df[cols[1]])
    # Floor at zero so reporting quirks never produce negative presumed cases
    dhis2_df[col_pres] = dhis2_df[col_pres].clip(lower=0)
# Presumed-case totals: community + facility, keeping NaN when both parts
# are missing (min_count=1), overall and per age group.
for pres_total, parts in {
    'pres': ['pres_com', 'pres_hf'],
    'pres_u5': ['pres_com_u5', 'pres_hf_u5'],
    'pres_5_14': ['pres_com_5_14', 'pres_hf_5_14'],
    'pres_ov15': ['pres_com_ov15', 'pres_hf_ov15'],
}.items():
    dhis2_df[pres_total] = dhis2_df[parts].sum(axis=1, skipna=True, min_count=1)
# Inspect results
dhis2_df.head(10).styleTo adapt the code: - Lines X–X: Adjust the variable names to reflect those relevant to your dataset when calculating new variables - Once updated, run the code
Step 4.2: (If needed) Quality control of indicator totals
Now let’s check that indicator totals are equal to the sum of their disaggregated components. The code chunk below filters and displays rows of the data frame where the indicator total does not equal the sum of its components. For example, rows where allout is not equal to the sum of allout_u5 and allout_ov5. Age group aggregates are also checked in this code.
If all totals were manually summed using the previous step, you may choose to skip this step. However, even if totals were manually calculated in the previous step, checking totals here can help you identify errors.
In the case that total columns are extracted from DHIS2, it is essential to perform this quality control check for coherency. This check confirms which components go into the aggregated totals extracted from DHIS2.
Show the code
# Check indicator totals: show rows where a total disagrees with the sum of
# its components.
# NOTE(review): `!=` with NA yields NA, so any row with a missing component
# is silently NOT flagged by filter() — confirm this is intended; coalescing
# to 0 first would flag them instead.
# NOTE(review): assumes dplyr is attached (bare filter()/select()) and that
# `periodname` exists in this dataset — confirm the date column's name.
dhis2_df |>
  filter(
    # Outpatient checks
    allout != allout_u5 + allout_ov5 |
      # Malaria admissions check
      maladm != maladm_u5 + maladm_5_14 + maladm_ov15 |
      # Total tests check
      test != test_hf + test_com |
      # Total confirmed check
      conf != conf_hf + conf_com |
      # Total treated check
      maltreat != maltreat_hf + maltreat_com |
      # Total presumed check
      pres != pres_hf + pres_com |
      # Malaria deaths (all reported bands)
      maldth != maldth_1_59m + maldth_u5 + maldth_5_9 + maldth_10_14 +
        maldth_5_14 + maldth_fem_ov15 + maldth_mal_ov15 + maldth_ov15 |
      # Tested cases checks (by age group)
      test_u5 != test_hf_u5 + test_com_u5 |
      test_5_14 != test_hf_5_14 + test_com_5_14 |
      test_ov15 != test_hf_ov15 + test_com_ov15 |
      # Confirmed cases checks
      conf_u5 != conf_hf_u5 + conf_com_u5 |
      conf_5_14 != conf_hf_5_14 + conf_com_5_14 |
      conf_ov15 != conf_hf_ov15 + conf_com_ov15 |
      # Presumed cases checks
      pres_u5 != pres_hf_u5 + pres_com_u5 |
      pres_5_14 != pres_hf_5_14 + pres_com_5_14 |
      pres_ov15 != pres_hf_ov15 + pres_com_ov15 |
      # Suspected cases checks (raw DHIS2 element names on the right)
      susp_u5 != susp_u5_hf + susp_u5_com |
      susp_5_14 != susp_5_14_hf + susp_5_14_com |
      susp_ov15 != susp_ov15_hf + susp_ov15_com |
      # Treated cases checks
      maltreat_u5 != maltreat_hf_u5 + maltreat_com_u5 |
      maltreat_5_14 != maltreat_hf_5_14 + maltreat_com_5_14 |
      maltreat_ov15 != maltreat_hf_ov15 + maltreat_com_ov15 |
      # Treatment timing checks
      maltreat_u24_total != maltreat_u24_hf + maltreat_u24_com |
      maltreat_ov24_total != maltreat_ov24_hf + maltreat_ov24_com
  ) |>
  # Keep only identifying columns and the totals being checked
  select(
    hf, periodname, allout,
    maldth, maladm, test,
    conf, maltreat, pres,
    test_u5, test_5_14, test_ov15,
    conf_u5, conf_5_14, conf_ov15,
    pres_u5, pres_5_14, pres_ov15,
    maltreat_u5, maltreat_5_14, maltreat_ov15,
    maltreat_u24_total, maltreat_ov24_total
)Step 4.3: indicator totals coherency visualization
Incoherent totals can be visualized to facilitate further investigation. The Sierra Leone example DHIS2 data does not contain any rows with incoherent totals. A deteriorated version of Sierra Leone’s DHIS2 data is used below to exemplify the case of incoherent totals. Any coherency issues demonstrated below are not representative of Sierra Leone’s DHIS2 data quality.
Show the code
# Define component checks for visualization:
# each entry pairs a reported total column with the component columns whose
# row-wise sum should reproduce it.
component_checks <- list(
  list(total = "allout", parts = c("allout_u5", "allout_ov5")),
  list(total = "maladm", parts = c("maladm_u5", "maladm_5_14", "maladm_ov15")),
  list(total = "test", parts = c("test_hf", "test_com")),
  list(total = "conf", parts = c("conf_hf", "conf_com")),
  list(total = "maltreat", parts = c("maltreat_hf", "maltreat_com")),
  list(total = "pres", parts = c("pres_hf", "pres_com"))
)
# Create visualization function
# Scatter reported totals against the recalculated sums of their components
# (pairs taken from the global `component_checks`); incoherent indicators
# are coloured red and labelled for follow-up.
plot_coherence_check <- function(data, facility = NULL, period = NULL) {
  # Optionally restrict to a single facility-month.
  # NOTE(review): filtering only happens when BOTH facility and period are
  # supplied (&&); passing just one of them silently plots all rows.
  if (!is.null(facility) && !is.null(period)) {
    data <- data[data$hf == facility & data$periodname == period, ]
  }
  # Build one (indicator, reported, calculated) data.frame per check,
  # skipping checks whose columns are absent from `data`.
  # NOTE(review): rowSums(..., na.rm = TRUE) turns all-missing component
  # rows into 0, which can show as incoherent against a non-zero total.
  results <- base::lapply(component_checks, function(x) {
    if (!x$total %in% base::names(data)) return(NULL)
    parts_exist <- x$parts[x$parts %in% base::names(data)]
    if (base::length(parts_exist) == 0) return(NULL)
    base::data.frame(
      indicator = x$total,
      reported = data[[x$total]],
      calculated = base::rowSums(data[parts_exist], na.rm = TRUE)
    )
  }) |>
    # Drop NULL/empty entries (NROW coerces to a logical keep-flag) and
    # stack the remaining data.frames.
    (\(x) base::do.call(base::rbind, base::Filter(base::NROW, x)))()
  # Create scatterplot: coherent points blue, incoherent points red
  graphics::plot(
    x = results$calculated,
    y = results$reported,
    xlab = "Calculated Sum of Components",
    ylab = "Reported Total",
    main = "Data Coherence Check: Reported vs Calculated Totals",
    pch = 19,
    col = base::ifelse(results$reported == results$calculated, "#4E79A7", "#E15759")
  )
  # Add identity line: coherent points fall exactly on it
  graphics::abline(a = 0, b = 1, col = "darkgray", lty = 2)
  # Add legend
  graphics::legend("topleft",
    legend = c("Coherent", "Incoherent"),
    pch = 19,
    col = c("#4E79A7", "#E15759"),
    bty = "n")
  # Label incoherent points with their indicator name
  incoherent <- base::which(results$reported != results$calculated)
  if (base::length(incoherent) > 0) {
    graphics::text(
      x = results$calculated[incoherent],
      y = results$reported[incoherent],
      labels = results$indicator[incoherent],
      pos = 4,
      cex = 0.8,
      col = "#E15759"
    )
  }
}Show the code
import matplotlib.pyplot as plt
import seaborn as sns


def plot_totals_check(data, total_col, component_cols, title="Reported vs Calculated Totals"):
    """Scatter the reported total column against the row-wise sum of its components.

    Rows where the two agree are drawn in blue, disagreements in red, and each
    disagreeing row is annotated with its facility name and period.
    """
    # Row-wise sum of the component columns
    component_sum = data[component_cols].sum(axis=1)

    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        x=component_sum,
        y=data[total_col],
        hue=(component_sum == data[total_col]),
        palette={True: "#4E79A7", False: "#E15759"},
        s=100,
        alpha=0.7,
    )

    # Identity line: coherent points sit on it
    upper = max(component_sum.max(), data[total_col].max())
    plt.plot([0, upper], [0, upper], '--', color='gray')

    plt.xlabel("Calculated sum of components")
    plt.ylabel("Reported total")
    plt.title(title)

    # Annotate each incoherent row with its facility and period
    disagreeing = data[component_sum != data[total_col]]
    for idx, row in disagreeing.iterrows():
        plt.text(
            x=component_sum[idx],
            y=data[total_col][idx],
            s=f"{row['hf']}\n{row['periodname']}",
            color="#E15759",
            fontsize=8,
        )

    plt.legend().remove()
    plt.show()
# List of indicators to check: each reported total with the component columns
# that should sum to it.
indicator_checks = [
    {"total": total, "components": components}
    for total, components in [
        ("allout", ["allout_u5", "allout_ov5"]),
        ("maladm", ["maladm_u5", "maladm_5_14", "maladm_ov15"]),
        ("test", ["test_hf", "test_com"]),
        ("conf", ["conf_hf", "conf_com"]),
        ("maltreat", ["maltreat_hf", "maltreat_com"]),
        ("pres", ["pres_hf", "pres_com"]),
    ]
]
# Generate all plots
# One coherence plot per indicator; checks whose total or component columns
# are missing from the extracted dataframe are skipped silently.
for ind in indicator_checks:
if all(col in dhis2_df.columns for col in [ind["total"]] + ind["components"]):
plot_totals_check(dhis2_df, ind["total"], ind["components"])Incoherent totals must be shared for review with the SNT team as they may signal critical data quality issues. These discrepancies can reflect reporting errors like overcounting or missing component data, but may also reveal systemic problems such as mismatched indicator definitions or inclusions. Regardless of origin, each case requires source-level verification to distinguish between true data errors and legitimate program exceptions. The SNT team can advise you on how to manage incoherent totals.
Step 4.4: Export rows with incoherent totals
Incoherent totals visualized in the previous step can be exported using this code. Exported outputs must be shared with the SNT team for review and guidance.
Show the code
# Identify rows with incoherent totals
# NOTE(review): if any column in a comparison is NA the whole condition is NA,
# and dplyr::filter() silently drops NA rows — records with missing totals or
# components are therefore never exported. Confirm this is the intended scope.
incoherent_rows <- dhis2_df_example |>
dplyr::filter(
# Outpatient checks
allout != allout_u5 + allout_ov5 |
# Malaria admissions check
maladm != maladm_u5 + maladm_5_14 + maladm_ov15 |
# Total tests check
test != test_hf + test_com |
# Total confirmed check
conf != conf_hf + conf_com |
# Total treated check
maltreat != maltreat_hf + maltreat_com |
# Total presumed check
pres != pres_hf + pres_com
) |>
# Keep identifiers plus every column belonging to the checked indicators
dplyr::select(
hf, periodname,
dplyr::matches("allout|maladm|test|conf|maltreat|pres")
)
# Export to CSV with timestamp
output_file <- paste0("incoherent_totals_", format(Sys.time(), "%Y%m%d_%H%M"), ".csv")
readr::write_csv(incoherent_rows, file = output_file)
# Print summary
if (nrow(incoherent_rows) > 0) {
message(
"Found ", nrow(incoherent_rows),
" rows with incoherent totals. Exported to: ", output_file
)
print(incoherent_rows)
} else {
message("No incoherent totals found in the data.")
}Step 5: Create a unique year-month column
Step 5.1: Example data date format
Here we want to end up with a column featuring the report date in format YYYY-MM, which will come in handy for future data merges and other operations. Note, the chunk of code below only applies to the specific SLE example dataset - you need to adapt it to the format of your Year-Month column. Reformatting the date also helps avoid overcrowding on the x-axis when data is plotted and keeps the labels clear and readable.
Show the code
# Build a "YYYY-MM" date column from the example data's "Month Year"
# periodname. FIX: `recode()` and `unite()` were unqualified, unlike every
# other call in this script; they are now namespace-qualified so the chunk
# also works when dplyr/tidyr are not attached.
dhis2_df <- dhis2_df |>
  # separate periodname into month and year columns
  tidyr::separate(
    col = tidyselect::all_of("periodname"),
    into = c("month", "year"),
    sep = " ",
    remove = TRUE
  )
dhis2_df <- dhis2_df |>
  # recode English month names to two-digit month numbers
  dplyr::mutate(
    month = dplyr::recode(
      month,
      "January" = "01",
      "February" = "02",
      "March" = "03",
      "April" = "04",
      "May" = "05",
      "June" = "06",
      "July" = "07",
      "August" = "08",
      "September" = "09",
      "October" = "10",
      "November" = "11",
      "December" = "12"
    )
  ) |>
  # create a new "YYYY-MM" date column, keeping year and month columns too
  tidyr::unite("date", year, month, sep = "-", remove = FALSE)
To adapt the code:
- Line 4: Change the
`periodname` in the code above to match the one in your dataset - Modify the code if the names of months in your data are in another language
- Once updated, run the code
Show the code
# format YM column
# this would only work for the format 'January 2020'
# if the format is different, we need to adjust the format string
# we need to make sure the result is in the format 'YYYY-MM'
# NOTE(review): '%B' parses month names in the active locale — confirm the
# runtime locale is English before relying on this for non-English extracts.
dhis2_df['YM'] = pd.to_datetime(dhis2_df['periodname'], format='%B %Y').dt.strftime('%Y-%m')
# Inspect results
dhis2_df.head(10).styleTo adapt the code:
Step 5.2: Handling alternative DHIS2 date formats
The example data illustrates one possible formatting of dates in DHIS2 extractions. However, not all data have this format. The table below outlines different date formats and the respective codes that can be used for reformatting. Example uses of these codes are also provided below the table.
| Format Example | Description | R Approach | Python Approach |
|---|---|---|---|
Janvier 2023 |
Month name + Year (French) | mutate(month = recode(month, "Janvier"="01", "Février"="02", ...)) |
Set locale first, then use %B %Y format |
2023-Jan |
Year-Month abbreviation | ymd(paste0(date_column,"-01")) %>% format("%Y-%m") |
pd.to_datetime(df['YM'], format='%Y-%b') |
01/2023 |
Month/Year (numeric) | my(date_column) %>% format("%Y-%m") |
pd.to_datetime(df['YM'], format='%m/%Y') |
2023-01-15 |
Full date (YYYY-MM-DD) | ymd(date_column) %>% format("%Y-%m") |
pd.to_datetime(df['YM']).dt.strftime('%Y-%m') |
202301 |
Compact YYYYMM format | ymd(paste0(date_column,"01")) %>% format("%Y-%m") |
pd.to_datetime(df['YM'], format='%Y%m') |
Example Usage:
Example 1: French month names
# Example: map French month names to two-digit numbers, then build "YYYY-MM".
df <- df|>
mutate(month = recode(month,
"Janvier" = "01",
"Février" = "02", # Note accent aigu
"Mars" = "03",
"Avril" = "04",
"Mai" = "05",
"Juin" = "06",
"Juillet" = "07",
"Août" = "08", # Note accent circonflexe
"Septembre" = "09",
"Octobre" = "10",
"Novembre" = "11",
"Décembre" = "12"
)) |>
unite("YM", year, month, sep = "-")Example 2: Portuguese month names
# Example: map Portuguese month names to two-digit numbers, then build "YYYY-MM".
df <- df |>
mutate(month = recode(month,
"Janeiro" = "01",
"Fevereiro" = "02",
"Março" = "03", # Note cedilha (ç) character
"Abril" = "04",
"Maio" = "05",
"Junho" = "06",
"Julho" = "07",
"Agosto" = "08",
"Setembro" = "09",
"Outubro" = "10",
"Novembro" = "11",
"Dezembro" = "12"
)) |>
unite("YM", year, month, sep = "-")Example 3: Compact YYYYMM format
Example 1: French month names:
Example 2: Portuguese month names
Example 3: Compact YYYYMM format:
Step 5.3: Unclear data format
If the date format in your data is not clear, the code below will analyze and extract the format.
Show the code
# Detect and parse an unknown period format: try a series of common DHIS2
# period layouts on each value and extract month, year and "YYYY-MM".
# Extract the values in the 'period_name' column to process
period_values <- df$period_name
# Pre-allocate result vectors (month, year, "YYYY-MM" string)
month_vec <- integer(length(period_values))
year_vec <- integer(length(period_values))
year_month_vec <- character(length(period_values))
# Loop through each value in the 'period_name' column
for (i in seq_along(period_values)) {
  # Clean up the string by trimming spaces
  val <- stringr::str_trim(period_values[i])
  # Initialize temporary variables for each value
  m <- NA_integer_
  y <- NA_integer_
  ym_str <- NA_character_
  if (is.na(val) || val == "") {
    # Missing or empty value: nothing to parse
    dt <- NA
  } else if (nchar(val) == 8 && substr(val, 4, 4) == "-") {
    # Format: "Jan-2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "b-Y"), error = function(e) NA)
  } else if (nchar(val) == 8 && substr(val, 4, 4) == " ") {
    # Format: "Jan 2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "b Y"), error = function(e) NA)
  } else if (stringr::str_detect(val, "-") && nchar(stringr::word(val, 1, sep = "-")) > 3) {
    # Format: "January-2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "B-Y"), error = function(e) NA)
  } else if (stringr::str_detect(val, " ") && nchar(stringr::word(val, 1, sep = " ")) > 3) {
    # Format: "January 2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "B Y"), error = function(e) NA)
  } else if (grepl("^[0-9]{2}-[0-9]{4}$", val)) {
    # Format: "01-2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "m-Y"), error = function(e) NA)
  } else if (grepl("^[0-9]{2}/[0-9]{4}$", val)) {
    # Format: "01/2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "m/Y"), error = function(e) NA)
  } else if (grepl("^[0-9]{2}\\.[0-9]{4}$", val)) {
    # Format: "01.2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "m.Y"), error = function(e) NA)
  } else if (grepl("^[0-9]{4}-[0-9]{2}$", val)) {
    # Format: "2015-01"
    dt <- tryCatch(lubridate::parse_date_time(val, "Y-m"), error = function(e) NA)
  } else if (grepl("^[0-9]{4}/[0-9]{2}$", val)) {
    # Format: "2015/01"
    dt <- tryCatch(lubridate::parse_date_time(val, "Y/m"), error = function(e) NA)
  } else if (grepl("^[0-9]{4}\\.[0-9]{2}$", val)) {
    # Format: "2015.01"
    dt <- tryCatch(lubridate::parse_date_time(val, "Y.m"), error = function(e) NA)
  } else if (grepl("^[0-9]{4} [A-Za-z]+$", val)) {
    # Formats: "2015 Jan" and "2015 January".
    # FIX: the original had two branches with identical conditions here, so
    # the "2015 January" branch was unreachable; lubridate's "b" matches both
    # abbreviated and full month names, so one branch covers both layouts.
    dt <- tryCatch(lubridate::parse_date_time(val, "Y b"), error = function(e) NA)
  } else if (stringr::str_detect(val, "^[A-Za-z]{4,}[0-9]{4}$")) {
    # Format: "January2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "BY"), error = function(e) NA)
  } else if (stringr::str_detect(val, "^[A-Za-z]{3}[0-9]{4}$")) {
    # Format: "Jan2015"
    dt <- tryCatch(lubridate::parse_date_time(val, "bY"), error = function(e) NA)
  } else if (grepl("^[0-9]{6}$", val)) {
    # Formats: "201501" (YYYYMM) or "012015" (MMYYYY).
    # FIX: the original's second six-digit branch ("mY") had the same
    # condition as the first and was unreachable. Try "Ym" first; when the
    # trailing two digits are not a valid month the parse yields NA and we
    # fall back to "mY".
    dt <- tryCatch(lubridate::parse_date_time(val, "Ym"), error = function(e) NA)
    if (is.na(dt)) {
      dt <- tryCatch(lubridate::parse_date_time(val, "mY"), error = function(e) NA)
    }
  } else {
    # Format not recognized
    dt <- NA
  }
  # If parsing succeeded, extract month and year, and build "YYYY-MM" string
  if (!is.na(dt)) {
    m <- lubridate::month(dt)
    y <- lubridate::year(dt)
    ym_str <- sprintf("%04d-%02d", y, m)
  }
  # Save values into the pre-allocated vectors
  month_vec[i] <- m
  year_vec[i] <- y
  year_month_vec[i] <- ym_str
}
# Add extracted values as new columns in the dataframe
# (month and year are numeric; year_month is the "YYYY-MM" string, NA where
# the period format could not be recognized)
df <- df |>
dplyr::mutate(
month = month_vec,
year = year_vec,
year_month = year_month_vec
)Step 6: (If needed) Create a unique ID
Step 6.1: Create unique IDs for admin units
VT: we could simplify this section by calling the hf column adm4 (or whatever level it is) and use the same UID column creation function for all levels in one go, instead of doing the same thing in two sections (one for admin units, one for hf). Happy to discuss
Show the code
# Set admin levels (EDIT THESE)
min_level <- 1 # Set highest admin unit level available in dataset (usually the Country, or Region/Province)
hf_level <- 5 # Set health facility admin unit level in the dataset (usually the lowest one)
#' Add a zero-padded UID column ("admN_00001", ...) for one admin level.
#'
#' Rows are sorted alphabetically by adm1..adm<level> and each distinct
#' combination of those columns receives a sequential group id.
#'
#' @param df Dataframe containing columns adm1 .. adm<level>.
#' @param level Integer admin level to create the UID column for.
#' @return `df` with an additional adm<level>_uid character column.
create_uid <- function(df, level) {
  cols <- paste0("adm", 1:level)
  df |>
    # FIX: `across()` was unqualified while every other call is namespaced;
    # qualify it so the chunk does not depend on dplyr being attached.
    dplyr::arrange(dplyr::across(dplyr::all_of(cols))) |>
    dplyr::group_by(dplyr::across(dplyr::all_of(cols))) |>
    dplyr::mutate(
      "{paste0('adm', level, '_uid')}" := paste0(
        "adm", level, "_",
        sprintf("%05d", dplyr::cur_group_id())
      )
    ) |>
    dplyr::ungroup()
}
# Create all UIDs
# One UID column per level, from the highest admin level down to the HF level.
for (level in min_level:hf_level) {
df <- create_uid(df, level)
}
# Column reordering (matches Python logic)
# Interleave each admN column with its admN_uid column.
base_cols <- lapply(
min_level:hf_level,
\(i) c(paste0("adm", i), paste0("adm", i, "_uid"))
) |>
unlist()
# Remaining (data) columns, sorted alphabetically
cols_data <- df |>
names() |>
setdiff(c("YM", base_cols)) |>
sort()
# Final output (identical to Python)
df <- df |>
dplyr::select(dplyr::all_of(c("YM", base_cols, cols_data))) |>
dplyr::arrange(across(dplyr::all_of(base_cols)))
# Verify first 10 rows
utils::head(df, 10)Show the code
# If your data does NOT contain admin unit UIDs:
def create_uid(df, level):
    """Append a zero-padded UID column ('adm<level>_00001', ...) for one admin level.

    The frame is sorted alphabetically by adm0..adm<level> and each distinct
    combination of those columns gets a sequential group number starting at 1.
    """
    key_cols = [f'adm{i}' for i in range(level + 1)]
    uid_col = f'adm{level}_uid'
    # Sort admin units alphabetically so the group numbering is deterministic
    ordered = df.sort_values(by=key_cols).reset_index(drop=True)
    # Sequential group id per distinct admin unit, starting at 1
    group_ids = ordered.groupby(key_cols).ngroup() + 1
    # Zero-pad to a fixed width so every uid has the same number of characters
    ordered[uid_col] = group_ids.map(lambda n: f'adm{level}_{str(n).zfill(5)}')
    return ordered
# create UID columns
# One UID column per admin level, from country (0) down to level 3.
admin_unit_levels = [0, 1, 2, 3]
for level in admin_unit_levels:
dhis2_df = create_uid(dhis2_df, level)
# reorder columns for display
# Interleave each adm{i} name column with its adm{i}_uid column, then append
# the facility column and prepend the YM period column.
base_cols = [[f'adm{i}', f'adm{i}_uid'] for i in admin_unit_levels]
base_cols = [i for j in base_cols for i in j]
base_cols = base_cols + ['hf']
base_cols = ['YM'] + base_cols
# Remaining (data) columns, sorted alphabetically
cols_data = sorted([c for c in dhis2_df.columns if c not in base_cols])
cols = base_cols + cols_data
dhis2_df = (dhis2_df[cols]
.sort_values(by = base_cols)
.reset_index(drop = True))
dhis2_df.head(10).styleTo adapt the code:
Step 6.2: Create unique IDs for health facilities
Most DHIS2 instances identify health facilities by unique IDs, but sometimes this information is not extracted with other columns. Even if your data was extracted with the unique health facility ID column, the column should be checked against the master facilities list (MFL) to identify errors such as blank IDs or multiple IDs for the same facility.
If the unique ID column was not downloaded, creating an ID column is essential. In many cases, different facilities share the same name but are located in different areas. Without a unique identifier these can be mistakenly treated as one, leading to errors in analysis and miscounts in SNT outputs. Assigning a distinct ID to each facility ensures accurate tracking and prevents naming overlaps from affecting results. Note that this code does not handle the case where a health facility is listed under two different names, such as typos or transliterations. In this case, the health facility will be counted as two separate facilities. To avoid this, it is strongly recommended to obtain the MFL prior to preprocessing and to download the health facility ID when extracting from DHIS2.
To adapt the code:
Show the code
# Set the following boolean to indicate whether your dataset contains UID
# columns for parent admin units (above HF-level)
admin_units_uids = True


def create_hf_uid(df, admin_unit_levels):
    """Append a zero-padded health-facility UID column ('HF_00001', ...).

    Facilities are keyed by their parent admin units (UID columns when
    available, otherwise name columns) plus the facility name, so identically
    named facilities in different areas receive distinct IDs.
    """
    # Key on parent admin UIDs when present, otherwise on admin name columns
    if admin_units_uids:
        key_cols = [f'adm{i}_uid' for i in admin_unit_levels] + ['hf']
    else:
        key_cols = [f'adm{i}' for i in admin_unit_levels] + ['hf']
    # Sort so that group numbering is deterministic
    ordered = df.sort_values(by=key_cols).reset_index(drop=True)
    # Sequential id per distinct facility key, starting at 1, zero-padded so
    # every uid has the same number of characters
    ordered['hf_uid'] = (ordered.groupby(key_cols).ngroup() + 1).map(
        lambda n: f'HF_{str(n).zfill(5)}'
    )
    return ordered
# set list of all levels above hf
admin_unit_levels = [0, 1, 2, 3]
dhis2_df = create_hf_uid(dhis2_df, admin_unit_levels)
# reorder columns for display
# NOTE(review): both branches of this if/else build the same nested list, so
# the admin_units_uids flag has no effect here; additionally the else branch
# never flattens the nested list (the flatten on the next line only runs in
# the if branch), which would make sort_values fail. The else branch
# presumably should build name-only columns and flatten them — confirm intent.
if admin_units_uids:
base_cols = [[f'adm{i}', f'adm{i}_uid'] for i in admin_unit_levels]
base_cols = [i for j in base_cols for i in j]
else:
base_cols = [[f'adm{i}', f'adm{i}_uid'] for i in admin_unit_levels]
base_cols = base_cols + ['hf', 'hf_uid']
base_cols = ['YM'] + base_cols
# Remaining (data) columns, sorted alphabetically
cols_data = sorted([c for c in dhis2_df.columns if c not in base_cols])
cols = base_cols + cols_data
dhis2_df = (dhis2_df[cols]
.sort_values(by = base_cols)
.reset_index(drop = True))
# Inspect results
dhis2_df.head(10).styleTo adapt the code:
Step 7: Second pass troubleshooting
Step 7.1: Highlight duplicate HF-month records with different data
Here we search for rows in the dataset that correspond to the same HF-month report but happen to have different data. If any are found, these need to be cleaned at this stage to make sure the dataset contains only one report for each combination of HF and month.
ADD APPROACH FOR CLEANING: IN STEP 7 TROUBLESHOOTING, MORE LIKELY TO NEED TO CONSULT SNT TEAM
To adapt the code:
Step 7.2: ETC OTHER PROBLEMS
Step 8: (If needed) Prepare optional ancillary datasets and versions
Step 8.1: Create and save your admin unit hierarchy
I am creating the orgunit tree below from the dataset - we could add a step before that to compare it with some reference (shapefile or HFML). I use this a lot throughout the process to simplify merge and other operations, and would suggest making it a feature throughout the SNT process, but happy to work without it if the team prefers not to.
To adapt the code:
Show the code
# Set the list of admin unit levels in your dataset, above HF-level
admin_units_levels = [0, 1, 2, 3]
# Build the orgunit column list: each adm level name + uid, then hf + hf_uid
cols = [[f'adm{level}', f'adm{level}_uid'] for level in admin_units_levels]
cols = [i for j in cols for i in j]
cols = cols + ['hf', 'hf_uid']
# One row per facility: the deduplicated admin-unit hierarchy ("tree")
dftree = (dhis2_df[cols]
.drop_duplicates()
.sort_values(by = cols)
.reset_index(drop = True))
# Save to file
# discuss with team if useful or not depending on use in other sections
# Inspect results
dftree.head(10).styleTo adapt the code:
Step 8.2: Fill in missing HF-month reports
VT:I would move this step to Step 2 I am assuming we are at the stage where we have checked for duplicates and have the following columns only: + YM (which is called periodname or pe or period in DHIS2, basically a string of character YYYYMM) + hf_uid + all the data columns + No other columns
I am also assuming the existence of dftree, which is the orgunit tree, which I simply construct from the original dataset. I use this a lot throughout the process to simplify operations, would suggest to make it feature but happy to work without it if the team prefers not to.
When examining monthly reports per health facility, you may choose to fill in gaps for health facility month records that are entirely missing. This enables identification of when a health facility became active and started reporting. The reporting rate page covers calculations to identify active and inactive facilities in more detail. Remember, data needs to be fully preprocessed prior to any reporting rate calculations.
Show the code
# # ensure date column is parsed as year-month
# ym_df <- dhis2_df |>
# dplyr::transmute(ym = lubridate::ym(date))
# # get full range of months
# ym_seq <- seq(
# min(ym_df$ym, na.rm = TRUE), max(ym_df$ym, na.rm = TRUE),
# by = "1 month") |> format("%Y-%m")
# # build base grid of all HF-months
# base <- tidyr::expand_grid(
# date = ym_seq, hf_uid = unique(dhis2_df$hf_uid)
# )
# # left join to full grid and merge admin structure
# dhis2_df_complete <- base |>
# dplyr::left_join(dhis2_df, by = c("date", "hf_uid"))
# dplyr::left_join(dftree, by = "hf_uid")To adapt the code:
Show the code
# select key columns - need to discuss how to list data columns in the simplest possible way
cols = [c for c in dhis2_df.columns if c not in ['Year', 'Month', 'hf', 'adm0', 'adm0_uid', 'adm1', 'adm1_uid', 'adm2', 'adm2_uid', 'adm3', 'adm3_uid']]
dhis2_df = (dhis2_df[cols]
.sort_values(by = ['YM', 'hf_uid'])
.reset_index(drop = True))
# keep track of current number of HF-month records
raw_length = len(dhis2_df)
# define dataset range of years
min_year = min(dhis2_df['YM'].apply(lambda x: int(x[0:4])))
max_year = max(dhis2_df['YM'].apply(lambda x: int(x[0:4])))
years = range(min_year, max_year+1)
# NOTE(review): `years` is not used below — `periods` rebuilds the same range
# inline. Keep or remove once confirmed.
# define base dataframe with all periods and health facilities
# periods = [str(year*100 + month) for year in years for month in range(1, 13)]
periods = [f'{year}-{month:02d}' for year in range(min_year, max_year + 1) for month in range(1, 13)]
hfs = sorted(dhis2_df['hf_uid'].unique())
cols = ['YM', 'hf_uid']
base = pd.DataFrame([(p, hf) for p in periods for hf in hfs], columns = cols)
# merge raw data to this base, to ensure all HF-month are represented, even if just with NA in data columns
dhis2_df = base.merge(dhis2_df, how = 'left', on = cols, validate = '1:1')
# reconstitute parent admin units
dhis2_df = dhis2_df.merge(dftree, on = 'hf_uid', how = 'left', validate = "m:1")
# Make Year and Month column
dhis2_df['Year'] = dhis2_df['YM'].apply(lambda x: int(x[0:4]))
dhis2_df['Month'] = dhis2_df['YM'].apply(lambda x: int(x[5:7]))
# Inform user about how many missing HF-month records were added
print(f"There were {len(dhis2_df)-raw_length} HF-month records missing")To adapt the code:
Step 8.3: IPD/OPD Specification
PLACEHOLDER STEP BASED ON JALINE AND VAL DISCUSSION 08/07
Step 9: Aggregate and save data
Step 9.1: Save data at the health facility level
Since some SNT analyses rely on facility-specific information, we will now save the data at the health facility level. Keeping the data at health facility level ensures that facility-level analyses can be done accurately and without loss of detail.
Show the code
# Export the facility-level dataset in several formats (xlsx, csv, dta, rds).
# set up output path
save_path <- here::here("01_data",
"02_epidemiology",
"2a_routine_surveillance",
"processed")
# save to xlsx
rio::export(dhis2_df, here::here(save_path, "health_facility_data.xlsx"))
# save to csv
rio::export(dhis2_df, here::here(save_path, "health_facility_data.csv"))
# save to dta
rio::export(dhis2_df, here::here(save_path, "health_facility_data.dta"))
# save to local area
# (RDS copy, presumably consumed by later pages of the workflow — confirm)
rio::export(
dhis2_df,
here::here("english/data_r/routine_cases/clean_malaria_routine_data_final.rds"))To adapt the code:
Show the code
# Set your filepath here
output_filepath = Path(here('english/data_r/routine_cases/', 'dhis2_processed_data_python'))
# Order columns: time columns first, then the orgunit tree columns, then the
# remaining data columns sorted alphabetically
base_cols = ['Year', 'Month', 'YM'] + list(dftree.columns)
cols_data = sorted([c for c in dhis2_df.columns if c not in base_cols])
cols = base_cols + cols_data
dhis2_df = (dhis2_df[cols]
.sort_values(by = base_cols)
.reset_index(drop = True))
# Save to csv
dhis2_df.to_csv(f'{output_filepath}.csv', index = None)
# Save to pickle
dhis2_df.to_pickle(f'{output_filepath}.pkl')
# Inspect results
dhis2_df.head(10).styleTo adapt the code:
Step 9.2: Aggregate and save data at each admin unit level level
Now we’ll save the data at different administrative levels to support various types of analysis and ensure alignment with how interventions are typically planned and monitored. This allows flexibility when calculating indicators, comparing trends across regions, or linking with other datasets structured at adm0, adm1, adm2, or facility level. The first code chunk aggregates data at the adm2 level.
When aggregating data to higher administrative levels, not all indicators can simply be summed. For example, if you have already calculated a test positivity rate or treatment rate in your dataset, these indicators must be re-calculated at the new administrative level.
Show the code
df <- dhis2_df
#' Aggregate monthly facility data to one admin level and export it.
#'
#' Sums every numeric column by the given grouping columns and writes the
#' result to xlsx and csv in `save_path`. Returns the aggregated dataframe
#' invisibly.
#'
#' @param df Facility-level dataframe.
#' @param group_cols Character vector of grouping columns (adm0..admN, year, month).
#' @param level_name Label such as "adm1", used in output file names.
#' @return The aggregated dataframe, invisibly.
aggregate_level <- function(df, group_cols, level_name) {
  monthly <- df |>
    dplyr::group_by(dplyr::across(dplyr::all_of(group_cols))) |>
    dplyr::summarise(
      dplyr::across(dplyr::where(is.numeric), \(x) sum(x, na.rm = TRUE)),
      .groups = "drop"
    )
  rio::export(monthly, here::here(save_path, paste0("aggregated_data_monthly_", level_name, "_data.xlsx")))
  rio::export(monthly, here::here(save_path, paste0("aggregated_data_monthly_", level_name, "_data.csv")))
  invisible(monthly)
}
# FIX: the original repeated the same if/summarise/export block four times
# (adm1-adm4); one loop now does each level, skipping levels whose columns
# are missing. The per-level results keep their original variable names
# (adm1_monthly .. adm4_monthly) so downstream chunks still work.
for (lvl in 1:4) {
  grouping <- c(paste0("adm", 0:lvl), "year", "month")
  if (all(grouping %in% names(df))) {
    assign(
      paste0("adm", lvl, "_monthly"),
      aggregate_level(df, grouping, paste0("adm", lvl))
    )
  } else {
    message("Skipping adm", lvl, ": one or more required columns are missing.")
  }
}
To adapt the code:
Show the code
# first we define a function
def aggregate_data(df, aggregation_period, aggregation_level, output_filepath):
# Step 1: Validate input
try:
if aggregation_period not in ['Year', 'Month']:
raise ValueError("Parameter 'aggregation_period' must be either 'Year' or 'Month'.")
except ValueError as e:
print("Input Error:", e)
return None # or: return pd.DataFrame()
# Define columns used for aggregation
cols = [[f'adm{i}', f'adm{i}_uid'] for i in range(1, aggregation_level +1)]
cols = [i for j in cols for i in j]
if aggregation_period == 'Year':
cols = ['Year'] + cols
elif aggregation_period == 'Month':
cols = ['YM'] + cols
# aggregate data
aggregated_df = df.groupby(cols)[cols_data].sum(min_count = 1).reset_index()
return aggregated_dfSimilarly, you can repeat the above aggregation at the adm3-year level by:
- (in R): grouping with
dplyr::group_by(adm1, adm2, adm3, year). - (in python): Edit the
aggregation_period,aggregation_levelandoutput_filepathvariables
This allows you to generate annual summaries for each administrative unit.
Summary
We’ve now walked through the key steps for cleaning, reshaping, and aggregating DHIS2 malaria data to make it SNT-ready. The code covered everything from importing raw files, standardising column names, computing key indicators, and saving outputs at multiple administrative levels. For convenience, a full end-to-end script is included below in a folded code block. You can reuse or adapt this for your own country context by adjusting column names, file paths, and administrative levels as needed.
Full code
Find the full code script for DHIS2 data preprocessing below.
Show full code
# install or load relevant packages
pacman::p_load(
tidyverse, # core tidy tools (dplyr, tidyr, etc.)
rio, # import and export and file format
DT, # view interactive tables
here # for easy file referencing
)
#===============================================================================
# Step 1: Define Functions and Import Data
#===============================================================================
#' Reads and combines multiple Excel or CSV files from a directory into a single
#' dataframe.
#'
#' This function searches for files with specified extension in the given
#' directory and combines them into a single dataframe. It handles both Excel
#' (.xls, .xlsx) and CSV files.
#'
#' @param directory Path to directory containing files
#' (default: "english/data_r")
#' @param pattern File extension (default: "xls"). Options: "xls", "xlsx",
#' "csv" and other formats available in rio package.
#' @return A dataframe combining all files with trimmed column names. Stops
#' with an error if no matching files are found; individual files that fail
#' to read are skipped and contribute nothing to the combined result.
#' @examples
#' # Read Excel files
#' df1 <- read_and_combine_files("path/to/excel", "xlsx")
#' # Read CSV files
#' df2 <- read_and_combine_files("path/to/csv", "csv")
#'
read_and_combine_files <- function(directory = "english/data_r",
pattern = "xls") {
# directory <- here::here(directory)
# Anchor the extension at the end of the file name
pattern <- paste0("\\.", pattern, "$")
# Find files matching the pattern
file_paths <- list.files(path = directory, pattern = pattern,
full.names = TRUE)
# Stop if no files found
if (length(file_paths) == 0) {
stop("No files found with specified extension in the directory.")
}
# Read all files based on extension
# Files that fail to import are mapped to NULL and silently skipped below
data_files <- lapply(file_paths, function(file) {
tryCatch(
{
rio::import(file)
},
error = function(e) {
NULL
}
)
})
# Combine all data frames (bind_rows() ignores NULL entries)
combined_df <- dplyr::bind_rows(data_files)
# Trim whitespace from column names
names(combined_df) <- trimws(names(combined_df))
return(combined_df)
}
#' Rename columns in a dataframe according to a predefined mapping
#'#' @param df A dataframe containing columns to be renamed
#'
#' @return
#' Returns the dataframe with renamed columns. Returns NULL if an error occurs.
#' @examples
#' df <- data.frame(adm0 = 1:3, allout_u5 = 4:6)
#' renamed_df <- rename_columns(df)
rename_columns <- function(df) {
# rename columns
rename_cols <- c(
# Organisational units
"adm0" = "orgunitlevel1",
"adm1" = "orgunitlevel2",
"adm2" = "orgunitlevel3",
"adm3" = "orgunitlevel4",
"hf" = "organisationunitname",
# All outpatient visits
"allout_u5" = "OPD (New and follow-up curative) 0-59m_X",
"allout_ov5" = "OPD (New and follow-up curative) 5+y_X",
# Malaria admissions
"maladm_u5" = "Admission - Child with malaria 0-59 months_X",
"maladm_5_14" = "Admission - Child with malaria 5-14 years_X",
"maladm_ov15" = "Admission - Malaria 15+ years_X",
# Malaria deaths
"maldth_1_59m" = "Child death - Malaria 1-59m_X",
"maldth_10_14" = "Child death - Malaria 10-14y_X",
"maldth_5_9" = "Child death - Malaria 5-9y_X",
"maldth_fem_ov15" = "Death malaria 15+ years Female",
"maldth_mal_ov15" = "Death malaria 15+ years Male",
"maldth_u5" = "Separation - Child with malaria 0-59 months_X Death",
"maldth_5_14" = "Separation - Child with malaria 5-14 years_X Death",
"maldth_ov15" = "Separation - Malaria 15+ years_X Death",
# Suspected cases
"susp_u5_hf" = "Fever case - suspected Malaria 0-59m_X",
"susp_5_14_hf" = "Fever case - suspected Malaria 5-14y_X",
"susp_ov15_hf" = "Fever case - suspected Malaria 15+y_X",
"susp_u5_com" = "Fever case in community (Suspected Malaria) 0-59m_X",
"susp_5_14_com" = "Fever case in community (Suspected Malaria) 5-14y_X",
"susp_ov15_com" = "Fever case in community (Suspected Malaria) 15+y_X",
# Malaria tests conducted at community level
"tes_neg_rdt_u5_com" =
"Fever case in community tested for Malaria (RDT) - Negative 0-59m_X",
"tes_pos_rdt_u5_com" =
"Fever case in community tested for Malaria (RDT) - Positive 0-59m_X",
"tes_neg_rdt_5_14_com" =
"Fever case in community tested for Malaria (RDT) - Negative 5-14y_X",
"tes_pos_rdt_5_14_com" =
"Fever case in community tested for Malaria (RDT) - Positive 5-14y_X",
"tes_neg_rdt_ov15_com" =
"Fever case in community tested for Malaria (RDT) - Negative 15+y_X",
"tes_pos_rdt_ov15_com" =
"Fever case in community tested for Malaria (RDT) - Positive 15+y_X",
# Malaria tests conducted at health facility level
"test_neg_mic_u5_hf" =
"Fever case tested for Malaria (Microscopy) - Negative 0-59m_X",
"test_pos_mic_u5_hf" =
"Fever case tested for Malaria (Microscopy) - Positive 0-59m_X",
"test_neg_mic_5_14_hf" =
"Fever case tested for Malaria (Microscopy) - Negative 5-14y_X",
"test_pos_mic_5_14_hf" =
"Fever case tested for Malaria (Microscopy) - Positive 5-14y_X",
"test_neg_mic_ov15_hf" =
"Fever case tested for Malaria (Microscopy) - Negative 15+y_X",
"test_pos_mic_ov15_hf" =
"Fever case tested for Malaria (Microscopy) - Positive 15+y_X",
"tes_neg_rdt_u5_hf" =
"Fever case tested for Malaria (RDT) - Negative 0-59m_X",
"tes_pos_rdt_u5_hf" =
"Fever case tested for Malaria (RDT) - Positive 0-59m_X",
"tes_neg_rdt_5_14_hf" =
"Fever case tested for Malaria (RDT) - Negative 5-14y_X",
"tes_pos_rdt_5_14_hf" =
"Fever case tested for Malaria (RDT) - Positive 5-14y_X",
"tes_neg_rdt_ov15_hf" =
"Fever case tested for Malaria (RDT) - Negative 15+y_X",
"tes_pos_rdt_ov15_hf" =
"Fever case tested for Malaria (RDT) - Positive 15+y_X",
# Malaria treatments given at community level
"maltreat_u24_u5_com" =
"Malaria treated in community with ACT <24 hours 0-59m_X",
"maltreat_ov24_u5_com" =
"Malaria treated in community with ACT >24 hours 0-59m_X",
"maltreat_u24_5_14_com" =
"Malaria treated in community with ACT <24 hours 5-14y_X",
"maltreat_ov24_5_14_com" =
"Malaria treated in community with ACT >24 hours 5-14y_X",
"maltreat_u24_ov15_com" =
"Malaria treated in community with ACT <24 hours 15+y_X",
"maltreat_ov24_ov15_com" =
"Malaria treated in community with ACT >24 hours 15+y_X",
# Malaria treatments given at health facility level
"maltreat_u24_u5_hf" =
"Malaria treated with ACT <24 hours 0-59m_X",
"maltreat_ov24_u5_hf" =
"Malaria treated with ACT >24 hours 0-59m_X",
"maltreat_u24_5_14_hf" =
"Malaria treated with ACT <24 hours 5-14y_X",
"maltreat_ov24_5_14_hf" =
"Malaria treated with ACT >24 hours 5-14y_X",
"maltreat_u24_ov15_hf" =
"Malaria treated with ACT <24 hours 15+y_X",
"maltreat_ov24_ov15_hf" =
"Malaria treated with ACT >24 hours 15+y_X"
)
df <- df |>
dplyr::rename(!!!rename_cols)
return(df)
}
# Import the raw DHIS2 extracts and standardise column names with the helper
# defined above.
raw_dir <- here::here("english/data_r")
dhis2_df <- raw_dir |>
  read_and_combine_files() |>
  rename_columns() # harmonise names to the analysis-ready scheme
# Sanity check: print the renamed columns
print(colnames(dhis2_df))
#===============================================================================
# Step 2: Compute New Variables
#===============================================================================
# Calculate totals
# All derived aggregates zero-fill missing sub-counts (rowSums with
# na.rm = TRUE), so a facility-month with one missing component still gets a
# numeric total rather than NA.
dhis2_df <- dhis2_df |>
  dplyr::mutate(
    # outpatient visits
    allout = rowSums(dplyr::across(c(allout_u5, allout_ov5)), na.rm = TRUE),
    # suspected cases (health facility + community)
    susp = rowSums(dplyr::across(c(
      susp_u5_hf, susp_5_14_hf, susp_ov15_hf,
      susp_u5_com, susp_5_14_com, susp_ov15_com
    )), na.rm = TRUE),
    # tested cases (microscopy + RDT at facility; RDT only in community)
    test_hf = rowSums(dplyr::across(c(
      test_neg_mic_u5_hf, test_pos_mic_u5_hf, test_neg_mic_5_14_hf,
      test_pos_mic_5_14_hf, test_neg_mic_ov15_hf, test_pos_mic_ov15_hf,
      tes_neg_rdt_u5_hf, tes_pos_rdt_u5_hf, tes_neg_rdt_5_14_hf,
      tes_pos_rdt_5_14_hf, tes_neg_rdt_ov15_hf, tes_pos_rdt_ov15_hf
    )), na.rm = TRUE),
    test_com = rowSums(dplyr::across(c(
      tes_neg_rdt_u5_com, tes_pos_rdt_u5_com, tes_neg_rdt_5_14_com,
      tes_pos_rdt_5_14_com, tes_neg_rdt_ov15_com, tes_pos_rdt_ov15_com
    )), na.rm = TRUE),
    test = test_hf + test_com,
    # confirmed cases (positives only, HF and COM)
    conf_hf = rowSums(dplyr::across(c(
      test_pos_mic_u5_hf, test_pos_mic_5_14_hf, test_pos_mic_ov15_hf,
      tes_pos_rdt_u5_hf, tes_pos_rdt_5_14_hf, tes_pos_rdt_ov15_hf
    )), na.rm = TRUE),
    conf_com = rowSums(dplyr::across(c(
      tes_pos_rdt_u5_com, tes_pos_rdt_5_14_com, tes_pos_rdt_ov15_com
    )), na.rm = TRUE),
    conf = conf_hf + conf_com,
    # treated cases
    maltreat_com = rowSums(dplyr::across(c(
      maltreat_u24_u5_com, maltreat_ov24_u5_com, maltreat_u24_5_14_com,
      maltreat_ov24_5_14_com, maltreat_u24_ov15_com, maltreat_ov24_ov15_com
    )), na.rm = TRUE),
    maltreat_hf = rowSums(dplyr::across(c(
      maltreat_u24_u5_hf, maltreat_ov24_u5_hf, maltreat_u24_5_14_hf,
      maltreat_ov24_5_14_hf, maltreat_u24_ov15_hf, maltreat_ov24_ov15_hf
    )), na.rm = TRUE),
    maltreat = maltreat_hf + maltreat_com,
    # presumed cases: treated without confirmation, floored at zero
    pres_com = pmax(maltreat_com - conf_com, 0, na.rm = TRUE),
    pres_hf = pmax(maltreat_hf - conf_hf, 0, na.rm = TRUE),
    pres = pres_com + pres_hf,
    # malaria admissions
    maladm = rowSums(dplyr::across(
      c(maladm_u5, maladm_5_14, maladm_ov15)), na.rm = TRUE),
    # malaria deaths (all reported age/sex breakdowns)
    maldth = rowSums(dplyr::across(c(
      maldth_u5, maldth_1_59m, maldth_10_14, maldth_5_9,
      maldth_5_14, maldth_ov15, maldth_fem_ov15, maldth_mal_ov15
    )), na.rm = TRUE),
    # AGE-GROUP SPECIFIC AGGREGATIONS
    # Tested cases by age group (HF only)
    test_hf_u5 = rowSums(dplyr::across(c(
      test_neg_mic_u5_hf, test_pos_mic_u5_hf,
      tes_neg_rdt_u5_hf, tes_pos_rdt_u5_hf
    )), na.rm = TRUE),
    test_hf_5_14 = rowSums(dplyr::across(c(
      test_neg_mic_5_14_hf, test_pos_mic_5_14_hf,
      tes_neg_rdt_5_14_hf, tes_pos_rdt_5_14_hf
    )), na.rm = TRUE),
    test_hf_ov15 = rowSums(dplyr::across(c(
      test_neg_mic_ov15_hf, test_pos_mic_ov15_hf,
      tes_neg_rdt_ov15_hf, tes_pos_rdt_ov15_hf
    )), na.rm = TRUE),
    # Tested cases by age group (Community only)
    test_com_u5 = rowSums(dplyr::across(c(
      tes_neg_rdt_u5_com, tes_pos_rdt_u5_com
    )), na.rm = TRUE),
    test_com_5_14 = rowSums(dplyr::across(c(
      tes_neg_rdt_5_14_com, tes_pos_rdt_5_14_com
    )), na.rm = TRUE),
    test_com_ov15 = rowSums(dplyr::across(c(
      tes_neg_rdt_ov15_com, tes_pos_rdt_ov15_com
    )), na.rm = TRUE),
    # Total tested by age group (HF + Community)
    test_u5 = test_hf_u5 + test_com_u5,
    test_5_14 = test_hf_5_14 + test_com_5_14,
    test_ov15 = test_hf_ov15 + test_com_ov15,
    # Suspected cases by age group (HF only); rowSums on a single column
    # coerces NA to 0, keeping NA handling consistent across aggregates
    susp_hf_u5 = rowSums(dplyr::across(c(
      susp_u5_hf
    )), na.rm = TRUE),
    susp_hf_5_14 = rowSums(dplyr::across(c(
      susp_5_14_hf
    )), na.rm = TRUE),
    susp_hf_ov15 = rowSums(dplyr::across(c(
      susp_ov15_hf
    )), na.rm = TRUE),
    # Suspected cases by age group (Community only)
    susp_com_u5 = rowSums(dplyr::across(c(
      susp_u5_com
    )), na.rm = TRUE),
    susp_com_5_14 = rowSums(dplyr::across(c(
      susp_5_14_com
    )), na.rm = TRUE),
    susp_com_ov15 = rowSums(dplyr::across(c(
      susp_ov15_com
    )), na.rm = TRUE),
    # Total suspected by age group (HF + Community)
    susp_u5 = susp_hf_u5 + susp_com_u5,
    susp_5_14 = susp_hf_5_14 + susp_com_5_14,
    susp_ov15 = susp_hf_ov15 + susp_com_ov15,
    # Confirmed cases by age group (HF only)
    conf_hf_u5 = rowSums(dplyr::across(c(
      test_pos_mic_u5_hf, tes_pos_rdt_u5_hf
    )), na.rm = TRUE),
    conf_hf_5_14 = rowSums(dplyr::across(c(
      test_pos_mic_5_14_hf, tes_pos_rdt_5_14_hf
    )), na.rm = TRUE),
    conf_hf_ov15 = rowSums(dplyr::across(c(
      test_pos_mic_ov15_hf, tes_pos_rdt_ov15_hf
    )), na.rm = TRUE),
    # Confirmed cases by age group (Community only)
    # FIX: previously bare column copies, which kept NA and propagated it into
    # conf_u5/conf_5_14/conf_ov15 while conf_com (and conf) zero-filled NA.
    # Wrapping in rowSums(..., na.rm = TRUE) makes NA handling consistent.
    conf_com_u5 = rowSums(dplyr::across(c(
      tes_pos_rdt_u5_com
    )), na.rm = TRUE),
    conf_com_5_14 = rowSums(dplyr::across(c(
      tes_pos_rdt_5_14_com
    )), na.rm = TRUE),
    conf_com_ov15 = rowSums(dplyr::across(c(
      tes_pos_rdt_ov15_com
    )), na.rm = TRUE),
    # Total confirmed by age group (HF + Community)
    conf_u5 = conf_hf_u5 + conf_com_u5,
    conf_5_14 = conf_hf_5_14 + conf_com_5_14,
    conf_ov15 = conf_hf_ov15 + conf_com_ov15,
    # Treated cases by age group (HF only)
    maltreat_hf_u5 = rowSums(dplyr::across(c(
      maltreat_u24_u5_hf, maltreat_ov24_u5_hf
    )), na.rm = TRUE),
    maltreat_hf_5_14 = rowSums(dplyr::across(c(
      maltreat_u24_5_14_hf, maltreat_ov24_5_14_hf
    )), na.rm = TRUE),
    maltreat_hf_ov15 = rowSums(dplyr::across(c(
      maltreat_u24_ov15_hf, maltreat_ov24_ov15_hf
    )), na.rm = TRUE),
    # Treated cases by age group (Community only)
    maltreat_com_u5 = rowSums(dplyr::across(c(
      maltreat_u24_u5_com, maltreat_ov24_u5_com
    )), na.rm = TRUE),
    maltreat_com_5_14 = rowSums(dplyr::across(c(
      maltreat_u24_5_14_com, maltreat_ov24_5_14_com
    )), na.rm = TRUE),
    maltreat_com_ov15 = rowSums(dplyr::across(c(
      maltreat_u24_ov15_com, maltreat_ov24_ov15_com
    )), na.rm = TRUE),
    # Total treated by age group (HF + Community)
    maltreat_u5 = maltreat_hf_u5 + maltreat_com_u5,
    maltreat_5_14 = maltreat_hf_5_14 + maltreat_com_5_14,
    maltreat_ov15 = maltreat_hf_ov15 + maltreat_com_ov15,
    # Total treated within 24 hours (HF only)
    maltreat_u24_hf = rowSums(dplyr::across(c(
      maltreat_u24_u5_hf, maltreat_u24_5_14_hf, maltreat_u24_ov15_hf
    )), na.rm = TRUE),
    # Total treated after 24 hours (HF only)
    maltreat_ov24_hf = rowSums(dplyr::across(c(
      maltreat_ov24_u5_hf, maltreat_ov24_5_14_hf, maltreat_ov24_ov15_hf
    )), na.rm = TRUE),
    # Total treated within 24 hours (Community only)
    maltreat_u24_com = rowSums(dplyr::across(c(
      maltreat_u24_u5_com, maltreat_u24_5_14_com, maltreat_u24_ov15_com
    )), na.rm = TRUE),
    # Total treated after 24 hours (Community only)
    maltreat_ov24_com = rowSums(dplyr::across(c(
      maltreat_ov24_u5_com, maltreat_ov24_5_14_com, maltreat_ov24_ov15_com
    )), na.rm = TRUE),
    # Overall totals (HF + Community)
    maltreat_u24_total = maltreat_u24_hf + maltreat_u24_com,
    maltreat_ov24_total = maltreat_ov24_hf + maltreat_ov24_com,
    # Presumed cases by age group (treated minus confirmed, floored at zero)
    pres_com_u5 = pmax(maltreat_com_u5 - conf_com_u5, 0, na.rm = TRUE),
    pres_com_5_14 = pmax(maltreat_com_5_14 - conf_com_5_14, 0, na.rm = TRUE),
    pres_com_ov15 = pmax(maltreat_com_ov15 - conf_com_ov15, 0, na.rm = TRUE),
    pres_hf_u5 = pmax(maltreat_hf_u5 - conf_hf_u5, 0, na.rm = TRUE),
    pres_hf_5_14 = pmax(maltreat_hf_5_14 - conf_hf_5_14, 0, na.rm = TRUE),
    pres_hf_ov15 = pmax(maltreat_hf_ov15 - conf_hf_ov15, 0, na.rm = TRUE),
    pres_u5 = pres_com_u5 + pres_hf_u5,
    pres_5_14 = pres_com_5_14 + pres_hf_5_14,
    pres_ov15 = pres_com_ov15 + pres_hf_ov15
  )
# Check totals: print any row where a derived total disagrees with the sum of
# its parts. NOTE(review): a comparison involving NA evaluates to NA and the
# row is silently dropped by filter(); the derived totals are NA-free but raw
# inputs such as allout_u5 may be NA — confirm this is acceptable.
# FIX: qualified filter/select with dplyr:: — unqualified, filter() resolves
# to stats::filter() when dplyr is not attached, since this script otherwise
# uses pkg::fun() calls throughout.
dhis2_df |>
  dplyr::filter(
    # Outpatient checks
    allout != allout_u5 + allout_ov5 |
      # Malaria admissions check
      maladm != maladm_u5 + maladm_5_14 + maladm_ov15 |
      # Total tests check
      test != test_hf + test_com |
      # Total confirmed check
      conf != conf_hf + conf_com |
      # Total treated check
      maltreat != maltreat_hf + maltreat_com |
      # Total presumed check
      pres != pres_hf + pres_com |
      # Tested cases checks
      test_u5 != test_hf_u5 + test_com_u5 |
      test_5_14 != test_hf_5_14 + test_com_5_14 |
      test_ov15 != test_hf_ov15 + test_com_ov15 |
      # Confirmed cases checks
      conf_u5 != conf_hf_u5 + conf_com_u5 |
      conf_5_14 != conf_hf_5_14 + conf_com_5_14 |
      conf_ov15 != conf_hf_ov15 + conf_com_ov15 |
      # Presumed cases checks
      pres_u5 != pres_hf_u5 + pres_com_u5 |
      pres_5_14 != pres_hf_5_14 + pres_com_5_14 |
      pres_ov15 != pres_hf_ov15 + pres_com_ov15 |
      # Suspected cases checks
      susp_u5 != susp_u5_hf + susp_u5_com |
      susp_5_14 != susp_5_14_hf + susp_5_14_com |
      susp_ov15 != susp_ov15_hf + susp_ov15_com |
      # Treated cases checks
      maltreat_u5 != maltreat_hf_u5 + maltreat_com_u5 |
      maltreat_5_14 != maltreat_hf_5_14 + maltreat_com_5_14 |
      maltreat_ov15 != maltreat_hf_ov15 + maltreat_com_ov15 |
      # Treatment timing checks
      maltreat_u24_total != maltreat_u24_hf + maltreat_u24_com |
      maltreat_ov24_total != maltreat_ov24_hf + maltreat_ov24_com
  ) |>
  dplyr::select(
    hf, periodname, allout,
    maldth, maladm, test,
    conf, maltreat, pres,
    test_u5, test_5_14, test_ov15,
    conf_u5, conf_5_14, conf_ov15,
    pres_u5, pres_5_14, pres_ov15,
    maltreat_u5, maltreat_5_14, maltreat_ov15,
    maltreat_u24_total, maltreat_ov24_total
  )
#===============================================================================
# Step 3: Fix Year and Month Columns
#===============================================================================
# Split "periodname" (e.g. "January 2023") into month and year, recode the
# month name to a two-digit number, then build a "YYYY-MM" date string.
# FIX: qualified recode/unite with dplyr::/tidyr:: — unqualified they error
# when the packages are not attached, since this script otherwise uses
# pkg::fun() calls throughout.
dhis2_df <- dhis2_df |>
  # separate periodname into month and year
  tidyr::separate(
    col = tidyselect::all_of("periodname"),
    into = c("month", "year"),
    sep = " ",
    remove = TRUE
  ) |>
  # recode month name to a zero-padded month number
  dplyr::mutate(
    month = dplyr::recode(
      month,
      "January" = "01",
      "February" = "02",
      "March" = "03",
      "April" = "04",
      "May" = "05",
      "June" = "06",
      "July" = "07",
      "August" = "08",
      "September" = "09",
      "October" = "10",
      "November" = "11",
      "December" = "12"
    )
  ) |>
  # create a new "date" column ("YYYY-MM"), keeping year and month too
  tidyr::unite("date", year, month, sep = "-", remove = FALSE)
#===============================================================================
# Step 4: Create a Unique `hf` Id
#===============================================================================
# Assign one stable identifier per health facility, where a facility is
# defined by its admin hierarchy (adm1/adm2/adm3) plus its name.
dhis2_df <- dhis2_df |>
  dplyr::group_by(adm1, adm2, adm3, hf) |>
  dplyr::mutate(
    # zero-padded 4-digit id, e.g. "hf_0001"
    hf_uid = paste0(
      "hf_", formatC(dplyr::cur_group_id(), width = 4, flag = "0")
    )
  ) |>
  dplyr::ungroup()
# Spot-check the facility-to-uid mapping
dhis2_df |>
  dplyr::distinct(adm2, adm3, hf, hf_uid) |>
  tail()
#===============================================================================
# Step 5: Save Health Facility Data
#===============================================================================
# Output directory for the processed facility-level data
save_path <- here::here(
  "01_data",
  "02_epidemiology",
  "2a_routine_surveillance",
  "processed"
)
# Export the cleaned facility data in each required format
for (ext in c("xlsx", "csv", "dta")) {
  rio::export(
    dhis2_df,
    here::here(save_path, paste0("health_facility_data.", ext))
  )
}
#===============================================================================
# Step 6: Group and Aggregate Monthly Data at the adm2 and adm1 Levels
#===============================================================================
# Aggregate facility rows to one row per adm2 per month (all numeric columns
# summed, NA treated as 0)
adm2_monthly_data <- dhis2_df |>
  dplyr::group_by(adm2, adm1, year, month, date) |>
  dplyr::summarise(
    dplyr::across(dplyr::where(is.numeric), \(x) sum(x, na.rm = TRUE)),
    .groups = "drop"
  )
# Export adm2 aggregates (xlsx, csv, dta)
for (ext in c("xlsx", "csv", "dta")) {
  rio::export(
    adm2_monthly_data,
    here::here(save_path, paste0("aggregated_data_monthly_adm2_data.", ext))
  )
}
# Aggregate facility rows to one row per adm1 per month
adm1_monthly_data <- dhis2_df |>
  dplyr::group_by(adm1, year, month, date) |>
  dplyr::summarise(
    dplyr::across(dplyr::where(is.numeric), \(x) sum(x, na.rm = TRUE)),
    .groups = "drop"
  )
# Export adm1 aggregates.
# FIX: the original xlsx export wrote "haggregated_data_monthly_adm1_data.xlsx"
# (typo), inconsistent with the csv/dta names; corrected here.
for (ext in c("xlsx", "csv", "dta")) {
  rio::export(
    adm1_monthly_data,
    here::here(save_path, paste0("aggregated_data_monthly_adm1_data.", ext))
  )
}
#===============================================================================
# End of Script
#===============================================================================