# Install pacman only if it's not already installed
if (!requireNamespace("pacman", quietly = TRUE)) {
install.packages("pacman")
}
# install or load relevant packages
pacman::p_load(
readxl, # import and read Excel files
ggplot2, # plotting
writexl, # export Excel files
tidyr, # data management
dplyr, # data manipulation
forcats, # categorical data management
viridis, # colourblind-friendly palettes
lubridate # date management
)Data coherency checks
Beginner
Overview
In the context of SNT, data coherency checks on routine surveillance data data are a required step to evaluate data quality and explore the appropriateness of the database for decision-making. Data coherency checks should be performed for different combinations of variables relevant for SNT (epidemiological stratifications, quality of uncomplicated or severe malaria care, quality of the surveillance system, etc.).
- Understand the logic and theory behind coherency checks
- Understand best practices for coherency checks
- Generate scatterplots to evaluate data coherency
- Examine one-to-one variable relationships visually
- Identify for follow-up which data points (health facility-months) are failing the coherency check
Key Concepts in Data Coherency Checks
Understanding Data Coherency Checks
A coherency test checks whether related variables follow logically consistent relationships (e.g., whole ≥ part). It identifies data errors or reporting inconsistencies by flagging violations of expected relationships. A violation of the expected relationship could be due to data quality issues, or could reflect other challenges related to quality of care, stock availability, etc. The SNT team may choose to use results of the data coherency checks to flag facilities with systematic or consistent violations for follow-up by district teams.
Standard coherency checks of the most disaggregated data often performed during SNT are listed below. Note that some checks may not be possible if a country’s routine system is not collecting certain data elements, whereas other checks not listed may also be of interest. We encourage analysts to consider what additional coherency checks may also help understand data quality and health facility operational challenges and therefore should be included in the SNT exercise.
Asking the SNT team for the NMP’s malaria case management protocols may be useful prior to this exercise to identify the main steps throughout the clinical management process when relevant incoherences need to be checked. The referral protocols from peripheral health facilities or community health workers to their assigned referral health centers or hospitals should also be understood to contextualize some of the incoherences that may arise from the data.
All-cause outpatient visits ≥ suspected malaria cases: At any given point in time at a health facility, the number of all-cause outpatient visits should always be higher than any other variable reported at that time from that health facility, as the number of outpatient consultations is always the starting point of a clinical process. Having a lower number of all-cause outpatients compared to suspected malaria cases is generally indicative of low quality of non-malaria specific information.
Suspected malaria cases ≥ tested malaria cases: The general recommendation for malaria case management is that all outpatients with a fever (check suspected case definition in the national malaria case management guidelines) need to be tested for malaria. Therefore, ideally, the number of suspected malaria cases should be equal to the number of tested malaria cases.
When tested cases are much lower than suspected cases, this may reflect data quality issues with either variable, stockouts on tests, or low testing rates. To explore this further, analysts can review the trends in RDT vs microscopy testing in the HFs with consistent incoherences, in an attempt to understand if the lack of testing is coming from a specific diagnostic at a given point in time.
When tested cases are higher than suspected cases, this may also reflect data quality issues. However, disparities do not always indicate errors, as proactive community testing or asymptomatic screening can legitimately increase tested cases beyond suspected counts. Whether this is a plausible explanation also depends on how the country captures data on outreach or community asymptomatic testing, which will need to be explored with the SNT team.
All-cause outpatient visits ≥ tested malaria cases: This check may be of interest if the NMP suggests that the information contained in the “suspected malaria” column is not reliable and the previous check cannot be done.
Tested malaria cases ≥ confirmed malaria cases: Points represent facility-level test positivity rates (TPRs): TPR = confirmed malaria cases divided by tested malaria cases. The expectation is that there are nearly no situations where test positivity rates will be 100%. Furthermore, it is not possible to have a higher number of confirmed cases compared to tested (TPR > 100%), except for very restricted circumstances such as for referral health centers for other health facilities or community health workers (to be discussed with the SNT team). If confirmed cases are higher than tested cases, there is likely a data quality issue with either data element or both.
Confirmed malaria cases ≥ treated malaria cases: Ideally, all confirmed cases should be treated with the age-appropriate therapeutic dose of the first-line treatments in the country. Therefore, the number of confirmed malaria cases should be the same as the number of treated malaria cases. Low treatment rates are observed when confirmed cases are higher than the treated cases, which could be due to stockouts of ACTs, or data quality issues. Having lower confirmed cases than treated cases may indicate occurrence of presumptive treatments, medication leaving the facility to be sold in the private sector, or data quality issues. Checking stock data from individual facilities with consistent high or low treatment rates may be helpful.
All-cause hospital admissions ≥ malaria admissions: There cannot be more inpatients due to malaria than the total number of inpatients. When all-cause admissions are lower than malaria admissions, this reflects data quality issues with inadequate reporting of either of the variables.
All-cause deaths ≥ malaria deaths: A 1:1 relationship implies that all reported deaths are malaria-attributed. When all-cause deaths are lower than the malaria deaths, this reflects data quality issues with inadequate reporting of either or both of the variables.
Malaria admissions ≥ malaria deaths: A 1:1 relationship implies that all hospitalized malaria cases died, or 100% hospital mortality ratios. When malaria admissions are lower than malaria deaths, this reflects data quality issues with inadequate reporting of either or both of these variables.
Note: Some countries use health facility death registries as the platform where community deaths are reported. When this is the case, the number of health facility admissions should not be compared to the deaths, as the source of information is different. Analysts should confirm this with the NMPs and attempt to differentiate between the deaths that occured in the hospital and those that took place in the community.
Consult the SNT team to validate which coherency checks are relevant and to ensure data elements are correctly used. For example, changes in definition of a data element or recommended practice may make some checks irrelevant for certain years in the dataset.
Best Practices in Data Coherency Checks
Data coherency checks need to be performed on the most granular data possible, for spatially and temporally, and ideally at the unit at which data is reported: most ofthen this is the health facility-month. This is because incoherency begins at the data entry level. As data are aggregated, incoherences are obscured.
Data coherency checks should be conducted at the subnational level: regional level for smaller countries with a low number of districts per region and HFs per district; or district level for large countries with districts with many health facilities. This means ensuring that data are disaggregated by region (or district), then conducting all checks separately for each subnational unit. Conducting the checks at the subnational level allows us to identify the regions (or districts) where data are of lower quality for decision-making.
All data issues identified during the SNT process, including those found during the data coherency checks, need to be documented thoroughly so that they can be shared and explored by the NMP and HMIS colleagues, and potentially corrected at the source. Documentation means not just relying on visualizations, but also generating tables (Excel files or similar) where incoherencies can be individually identified. Sometimes, corrections may not be possible in the HMIS’s DHIS2 instances given their data reviewing and closure protocols. However, countries that have national malaria data repositories (NMDRs) should ensure that their NMDR platform allows the storage of changes that need to be applied retrospectively to the data automatically available from DHIS2.
As data incoherences are reviewed and discussed, analysts should take notes of the main challenges reported by the SNT team on surveillance quality or case management issues, so that this can be explored and presented in the appropriate stratification sections.
Step-by-Step
Below is a step-by-step walkthrough of generating plots that visualize one-to-one variable relationships. Each step includes notes to help you understand, adapt, and use the code.
To skip the step-by-step explanation, jump to the full code at the end of this page.
Step 1: Load libraries and read in data
Step 1.1: Install and load libararies
The first step is to install and load the libraries required for this section. Next, we read in the data needed to generate scatterplots.
Install packages in your terminal, if not already installed. If you need help installing packages, please refer to the Getting Started page.
Once packages are installed, load the relevant packages.
# load relevant packages
import pandas as pd # data manipulation and Excel
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting, color palette
import numpy as np # numerics
import os # file paths
import pyreadr # reading rds files
from datetime import datetime # date management
from matplotlib.lines import Line2D # plotting
from pyhere import here # file pathsStep 1.2: Load data
Load the data to perform coherency checks on. The example code continues the use of the preprocessed health facility-level Sierra Leone DHIS2 data.
Warning: Missing `trust` will be set to FALSE by default for RDS in 2.0.0.
To adapt the code:
- Line 4: Change the path
1.2_epidemiology/1.2a_routine_surveillance/rawto match thefolder/subfolderwhere your clean DHIS2 epidemiology data are saved - Line 5: Update the filename
clean_malaria_routine_data_final.rdsto match the filename of your clean DHIS2 epidemiology data
To adapt the code:
- Line 2: Change the path
"1.2_epidemiology", "1.2a_routine_surveillance", "raw"to match the"folder","subfolder"where your clean DHIS2 epidemiology data are saved - Line 3: Update the filename
clean_malaria_routine_data_final.rdsto match the filename of your clean DHIS2 epidemiology data
Step 2: Generate basic scatterplot
Next we generate a scatterplot to visualize one-to-one variable relationships. This example examines the overall relationship between confirmed cases (conf) and treated cases (maltreat).
Show full code
confmaltreat <- clean_malaria_routine_data_final |>
ggplot2::ggplot(ggplot2::aes(x = maltreat, y = conf)) +
ggplot2::geom_point(
ggplot2::aes(color = ifelse(maltreat > conf, "red", "#1e81b0")),
alpha = 0.6,
size = 3
) +
ggplot2::scale_color_identity(
name = "",
breaks = c("red", "#1e81b0"),
labels = c("Treated > Confirmed", "Confirmed ≥ Treated"),
guide = "legend"
) +
ggplot2::geom_abline(
slope = 1,
intercept = 0,
color = "black",
linetype = "dashed"
) +
ggplot2::labs(
title = "Confirmed vs. Treated Cases by Health Facility",
x = "Treated Cases",
y = "Confirmed Cases"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(size = 24,
face = "bold",
hjust = 0.5),
axis.title.x = ggplot2::element_text(size = 22),
axis.title.y = ggplot2::element_text(size = 22),
legend.text = ggplot2::element_text(size = 20),
legend.position = "right",
axis.text.x = ggplot2::element_text(angle = 90,
vjust = 0.5,
hjust = 1,
size = 18),
axis.text.y = ggplot2::element_text(size = 18)
) +
ggplot2::coord_equal() +
ggplot2::xlim(
0,
max(clean_malaria_routine_data_final$maltreat,
clean_malaria_routine_data_final$conf,
na.rm = TRUE)
) +
ggplot2::ylim(
0,
max(clean_malaria_routine_data_final$maltreat,
clean_malaria_routine_data_final$conf,
na.rm = TRUE)
)
# Save plot
ggplot2::ggsave(
"03_outputs/coherency_checks/confmaltreat_coherency_check.png",
plot = confmaltreat,
width = 12,
height = 8,
units = "in",
dpi = 300
)To adapt the code:
- Line 1: Change
confmaltreatandclean_malaria_routine_data_finalto reflect the variables and data frame being plotted - Line 2: Replace
maltreatandconfin with the x and y-variables you wish to plot - Line 4: Replace
maltreat > confwith the variables of the coherency check you wish to plot - Line 11: Modify the legend labels based on the variables being plotted
- Line 21-23: Modify the plot and axes title based on the variables being plotted
- Line 43-50: Replace
clean_malaria_routine_data_final,conf, andmaltreatto use the limits of the variables from the data you are using - Line 48-49: Replace
confmaltreat_coherency_check.pngwith the desired file name and replaceconfmaltreatwith the plot name defined in Line 1
Show full code
plt.style.use('bmh') # similar to theme_minimal() in R
plt.figure(figsize=(12, 12))
x = clean_malaria_routine_data_final["maltreat"]
y = clean_malaria_routine_data_final["conf"]
colors = np.where(x > y, "red", "#1e81b0")
plt.scatter(x, y, c=colors, alpha=0.6, s=70, label=None)
# Line: y=x
lims = [0, np.nanmax([x.max(), y.max()])]
plt.plot(lims, lims, 'k--', lw=1)
plt.grid(True, which='major', color='lightgray', linestyle='-', linewidth=1)
plt.gca().set_facecolor('white')
# Legends
from matplotlib.lines import Line2D
legend_elements = [
Line2D([0], [0], marker='o', color='w', label='Treated > Confirmed',
markerfacecolor='red', markersize=10),
Line2D([0], [0], marker='o', color='w', label='Confirmed ≥ Treated',
markerfacecolor='#1e81b0', markersize=10)
]
plt.legend(handles=legend_elements, loc='center right', title='', fontsize=22,facecolor='white')
plt.title("Confirmed vs. Treated Cases by Health Facility", fontsize=24, weight='bold')
plt.xlabel("Treated Cases", fontsize=22, weight='semibold')
plt.ylabel("Confirmed Cases", fontsize=22,weight='semibold')
plt.xticks(rotation=0, fontsize=18)
plt.yticks(fontsize=18)
plt.xlim(lims)
plt.ylim(lims)
plt.gca().set_aspect('equal', adjustable='box')
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "confmaltreat_coherency_check.png"), dpi=300)
plt.close()To adapt the code:
- Line 4-5: Change
clean_malaria_routine_data_finalto reflect the data frame being plotted - Line 4-5: Replace
maltreatandconfin with the x and y-variables you wish to plot - Line 7: Replace
x > ywith the variables of the coherency check you wish to plot - Line 18: Modify the legend labels based on the variables being plotted
- Line 26-28: Modify the plot and axes title based on the variables being plotted
- Line 39: Replace
confmaltreat_coherency_check.pngwith the desired file name
Step 3: Generate subsetted scatterplots
Step 3.1: Scatterplots by year
This example subsets the plots by year, which also allows for comparison across time and identification of temporal patterns. The plots are arranged into a grid and then exported.
This code follows the same logic as Step 2, but here we stratify by year using ggplot2::facet_wrap(~ year).
Show full code
confmaltreat_grid_year <- clean_malaria_routine_data_final |>
dplyr::group_by(year) |>
dplyr::mutate(
facet_max = max(c(maltreat, conf), na.rm = TRUE) # Calculate per-facet max values
) |>
dplyr::ungroup() |>
ggplot2::ggplot(
ggplot2::aes(x = maltreat, y = conf)) +
ggplot2::geom_point(
ggplot2::aes(
color = ifelse(maltreat > conf, "red", "#1e81b0")),
alpha = 0.6, size = 1.5
) +
ggplot2::scale_color_identity(
name = "",
breaks = c("red", "#1e81b0"),
labels = c("Treated > Confirmed", "Confirmed ≥ Treated"),
guide = "legend"
) +
ggplot2::geom_abline(
slope = 1,
intercept = 0,
color = "black",
linetype = "dashed"
) +
ggplot2::facet_wrap(~ year,
ncol = 3,
scales = "free") +
ggplot2::labs(
title = "Confirmed vs. Treated Cases by Year",
x = "Treated Cases",
y = "Confirmed Cases"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(size = 22,
face = "bold",
hjust = 0.5),
axis.title.x = ggplot2::element_text(size = 18),
axis.title.y = ggplot2::element_text(size = 18),
legend.text = ggplot2::element_text(size = 16),
strip.text = ggplot2::element_text(size = 14,
face = "bold"),
legend.position = "right",
axis.text.x = ggplot2::element_text(angle = 90,
vjust = 0.5,
hjust = 1,
size = 12),
axis.text.y = ggplot2::element_text(size = 12)
) +
ggplot2::geom_blank(ggplot2::aes(x = facet_max,
y = facet_max)) +
ggplot2::geom_blank(ggplot2::aes(x = 0, y = 0))
# Calculate dynamic height based on years of data
n_groups <- length(unique(clean_malaria_routine_data_final$year))
plot_height <- 4 * ceiling(n_groups / 3) # 4 inches per row
# Save plot
ggplot2::ggsave(
"03_outputs/coherency_checks/confmaltreat_grid_year.png",
plot = confmaltreat_grid_year,
width = 12,
height = plot_height,
units = "in",
dpi = 300
)To adapt the code:
- Line 1: Change
confmaltreat_grid_yearandclean_malaria_routine_data_finalto reflect the variables and data frame you are plotting - Line 2: Replace
yearwith the variable you wish to stratify plots - Line 4: Replace
confandmaltreatwith the variables of the coherency check you wish to plot - Line 8: Replace
confandmaltreatwith the variables of the coherency check you wish to plot - Line 11: Replace
maltreat > confwith the variables and coherency check you wish to plot. Pay attention to the inequality sign based on the check you are performing! - Line 17: Modify the legend labels based on the variables being plotted
- Line 26: Replace
yearwith the variable you wish to stratify plots - Line 30-32: Modify the plot and axes titles based on the variables being plotted
- Line 56: Replace
clean_malaria_routine_data_final$yearwith thedataframe$stratifingvariableyour plot uses - Line 61: Replace
confmaltreat_grid_year.pngwith the desired file name - Line 62: Replace
confmaltreat_grid_yearwith the plot name defined in Line 1
This code follows the same logic as Step 2, but here we stratify by year using seaborn.FacetGrid.
Show full code
plt.style.use('bmh')
grouped = clean_malaria_routine_data_final.copy()
n_years = grouped['year'].nunique()
plot_height = 4.5 * int(np.ceil(n_years / 3)) # 4.5 inches/row, 3 columns
column_width = 5.5 # 5.5 inches/column
plot_width = column_width * 3 # Total width for 3 columns
def add_diag(data, **kwargs):
facet_max = np.nanmax([data["maltreat"].max(), data["conf"].max()])
plt.plot([0, facet_max], [0, facet_max], 'k--', lw=1)
plt.xlim(0, facet_max);
plt.ylim(0, facet_max);
g = sns.FacetGrid(
grouped, col="year", col_wrap=3, height=4, sharex=False, sharey=False
)
g.map_dataframe(
lambda data, color: plt.scatter(
data["maltreat"], data["conf"],
c=np.where(data["maltreat"] > data["conf"], "red", "#1e81b0"),
alpha=0.6, s=18,
)
)
g.map_dataframe(add_diag)
g.set_titles("{col_name}")
g.set_xlabels('')
g.set_ylabels('')
# Add single figure-level labels
g.fig.text(0.5, 0.01, 'Treated Cases', ha='center', fontsize=24, fontweight='semibold') # x-axis label
g.fig.text(0.01, 0.5, 'Confirmed Cases', va='center', rotation='vertical', fontsize=24, fontweight='semibold') # y-axis label
for ax, (_, group) in zip(g.axes.ravel(), grouped.groupby("year")):
ax.set_facecolor('white')
facet_max = np.nanmax([group["maltreat"].max(), group["conf"].max()])
ax.set_xlim(0, facet_max)
ax.set_ylim(0, facet_max)
ax.title.set_fontsize(20)
ax.title.set_fontweight('bold')
ax.set_aspect('equal', adjustable='box')
ax.grid(True, color='lightgray', linestyle='-', linewidth=1)
ax.tick_params(axis='x', rotation=0, labelsize=18)
ax.tick_params(axis='y',labelsize=18)
g.fig.subplots_adjust(top=0.9, right=0.75, left=0.07, bottom=0.05)
g.fig.suptitle("Confirmed vs. Treated Cases by Year", fontsize=30, fontweight='bold')
g.fig.set_size_inches(plot_width, plot_height)
legend_elements = [
Line2D([0], [0], marker='o', color='w', label='Treated > Confirmed',
markerfacecolor='red', markersize=10),
Line2D([0], [0], marker='o', color='w', label='Confirmed ≥ Treated',
markerfacecolor='#1e81b0', markersize=10)
]
g.fig.legend(
handles=legend_elements,
loc='center left',
bbox_to_anchor=(0.73, 0.5), # Tweak as needed
fontsize=22,
frameon=False
)
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "confmaltreat_grid_year.png"), dpi=300, bbox_inches='tight')
plt.close()To adapt the code:
- Line 2: Change
clean_malaria_routine_data_finalto reflect the data frame you are plotting - Line 3: Replace
yearwith the variable you wish to stratify plots - Line 10: Replace
confandmaltreatwith the variables of the coherency check you wish to plot - Line 16: Replace
yearwith the variable you wish to stratify plots - Line 20-21: Replace
confandmaltreatwith the variables of the coherency check you wish to plot. Pay attention to the inequality sign based on the check you are performing! - Line 33-34: Modify the axes titles based on the variables being plotted
- Line 37: Replace
yearwith the variable you wish to stratify plots - Line 39: Replace
confandmaltreatwith the variables of the coherency check you wish to plot. - Line 50: Modify the plot title based on the variables being plotted
- Line 54-56: Modify the legend labels based on the variables being plotted
- Line 71: Replace
confmaltreat_grid_year.pngwith the desired file name
Step 3.2: Scatterplots by district
The example below subsets the plots by district, which allows for comparison across time and space. The plots are arranged into a grid and then exported.
This code follows the same logic as Step 3.1, but here we stratify by district using ggplot2::facet_wrap(~ adm1).
Show full code
confmaltreat_grid_adm1 <- clean_malaria_routine_data_final |>
dplyr::group_by(adm1) |>
dplyr::mutate(
facet_max = max(c(maltreat, conf), na.rm = TRUE) # Calculate per-facet max values
) |>
dplyr::ungroup() |>
ggplot2::ggplot(
ggplot2::aes(x = maltreat, y = conf)) +
ggplot2::geom_point(
ggplot2::aes(
color = ifelse(maltreat > conf, "red", "#1e81b0")),
alpha = 0.6, size = 1.5
) +
ggplot2::scale_color_identity(
name = "",
breaks = c("red", "#1e81b0"),
labels = c("Treated > Confirmed", "Confirmed ≥ Treated"),
guide = "legend"
) +
ggplot2::geom_abline(
slope = 1,
intercept = 0,
color = "black",
linetype = "dashed"
) +
ggplot2::facet_wrap(~ adm1,
ncol = 4,
scales = "free",
labeller = ggplot2::labeller(adm1 = function(x) gsub(" District", "", x))) +
ggplot2::labs(
title = "Confirmed vs. Treated Cases by District (adm1)",
x = "Treated Cases",
y = "Confirmed Cases"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(size = 24,
face = "bold",
hjust = 0.5),
axis.title.x = ggplot2::element_text(size = 22),
axis.title.y = ggplot2::element_text(size = 22),
legend.text = ggplot2::element_text(size = 20),
strip.text = ggplot2::element_text(size = 18,
face = "bold"),
legend.position = "right",
legend.margin = ggplot2::margin(l = 0.5, r = 2),
axis.text.x = ggplot2::element_text(angle = 90,
vjust = 0.5,
hjust = 1,
size = 16),
axis.text.y = ggplot2::element_text(size = 16)
) +
ggplot2::geom_blank(ggplot2::aes(x = facet_max,
y = facet_max)) +
ggplot2::geom_blank(ggplot2::aes(x = 0, y = 0))
# Calculate dynamic height based on number of adm1 units
n_adm1 <- length(unique(clean_malaria_routine_data_final$adm1))
plot_height <- 4 * ceiling(n_adm1 / 4) # 4 inches per row, 4 plots per row
# Save plot
ggplot2::ggsave(
"03_outputs/coherency_checks/confmaltreat_grid_adm1.png",
plot = confmaltreat_grid_adm1,
width = 16,
height = plot_height,
units = "in",
dpi = 300
)To adapt the code:
- Line 1: Change
confmaltreat_grid_yearandclean_malaria_routine_data_finalto reflect the variables and data frame you are plotting - Line 2: Replace
adm1with the variable you wish to stratify plots - Line 4: Replace
confandmaltreatwith the variables and coherency check you wish to plot - Line 8: Replace
confandmaltreatwith the variables and coherency check you wish to plot - Line 11: Replace
maltreat > confwith the variables and coherency check you wish to plot. Pay attention to the inequality sign based on the check you are performing! - Line 17: Modify the legend labels based on the variables being plotted
- Line 26: Replace
adm1with the variable you wish to stratify plots - Line 29: Delete this line of code or modify
function(x) gsub(" District", "", x)to adjust the subplot titles. In this example, ‘District’ is removed from the district name so that longer district names fit within the subset plot titles - Line 31-33: Modify the plot and axes titles based on the variables being plotted
- Line 58: Replace
clean_malaria_routine_data_final$adm1with thedataframe$stratifingvariableyour plot uses - Line 63: Replace
confmaltreat_grid_year.pngwith the desired file name - Line 64: Replace
confmaltreat_grid_adm1with the plot name defined in Line 1
This code follows the same logic as Step 3.1, but here we stratify by district using seaborn.FacetGrid.
Show full code
plt.style.use('bmh')
grouped = clean_malaria_routine_data_final.copy()
n_adm1 = grouped['adm1'].nunique()
plot_height = 4 * int(np.ceil(n_adm1 / 4)) # 4 inches/row, 4 columns
def add_diag(data, **kwargs):
facet_max = np.nanmax([data["maltreat"].max(), data["conf"].max()])
plt.plot([0, facet_max], [0, facet_max], 'k--', lw=1)
plt.xlim(0, facet_max)
plt.ylim(0, facet_max)
g = sns.FacetGrid(
grouped, col="adm1", col_wrap=4, height=4, sharex=False, sharey=False
)
g.map_dataframe(
lambda data, color: plt.scatter(
data["maltreat"], data["conf"],
c=np.where(data["maltreat"] > data["conf"], "red", "#1e81b0"),
alpha=0.6, s=18,
)
)
g.map_dataframe(add_diag)
# Remove individual subplot labels
g.set_xlabels('')
g.set_ylabels('')
g.set_titles("{col_name}")
# Remove "District" from subplot titles to accommodate long district names
for ax in g.axes.flat:
title = ax.get_title()
ax.set_title(title.replace(' District', ''))
# Add single figure-level labels
g.fig.text(0.5, 0.01, 'Treated Cases', ha='center', fontsize=24)
g.fig.text(0.01, 0.5, 'Confirmed Cases', va='center', rotation='vertical', fontsize=24)
for ax, (_, group) in zip(g.axes.ravel(), grouped.groupby("adm1")):
ax.set_facecolor('white')
facet_max = np.nanmax([group["maltreat"].max(), group["conf"].max()])
ax.set_xlim(0, facet_max)
ax.set_ylim(0, facet_max)
ax.title.set_fontsize(18)
ax.title.set_fontweight('bold')
ax.set_aspect('equal', adjustable='box')
ax.grid(True, color='lightgray', linestyle='-', linewidth=1)
ax.tick_params(axis='x', rotation=0, labelsize=16)
ax.tick_params(axis='y',labelsize=16)
g.fig.subplots_adjust(top=0.95, right=0.75, left=0.09, bottom=0.05)
g.fig.suptitle("Confirmed vs. Treated Cases by District (adm1)", fontsize=26, fontweight='bold')
legend_elements = [
Line2D([0], [0], marker='o', color='w', label='Treated > Confirmed',
markerfacecolor='red', markersize=10),
Line2D([0], [0], marker='o', color='w', label='Confirmed ≥ Treated',
markerfacecolor='#1e81b0', markersize=10)
]
g.fig.legend(
handles=legend_elements,
loc='center left',
bbox_to_anchor=(0.75, 0.5), # Tweak legend location as needed
fontsize=18,
frameon=False
)
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "confmaltreat_grid_adm1.png"), dpi=300, bbox_inches='tight')
plt.close()To adapt the code:
- Line 2: Change
clean_malaria_routine_data_finalto reflect the data frame you are plotting - Line 3: Replace
adm1with the variable you wish to stratify plots - Line 7: Replace
confandmaltreatwith the variables you wish to plot - Line 13: Replace
adm1with the variable you wish to stratify plots - Line 18-19: Replace
confandmaltreatwith the variables and coherency check you wish to plot. Pay attention to the inequality sign based on the check you are performing! - Line 32-34: Delete this code or modify
title.replace(' District', '')to adjust the subplot titles. In this example, ‘District’ is removed from the district name so that longer district names fit within the subset plot titles - Line 37-38: Modify the axes titles based on the variables being plotted
- Line 41: Replace
adm1with the variable you wish to stratify plots - Line 43: Replace
confandmaltreatwith the variables you wish to plot - Line 54: Modify the plot title based on the variables being plotted
- Line 57-59: Modify the legend labels based on the variables being plotted
- Line 74: Replace
confmaltreat_grid_year.pngwith the desired file name
Step 4: Save and export select rows
Rows of the data frame that fail the coherency check can be filtered, saved, and exported for further investigation and review by the SNT team.
To adapt the code:
- Line 1-3: Replace
clean_malaria_routine_data_final,conf, andmaltreatwith the data frame, x-variable, and y-variable of your coherency check - Line 8: Replace
"03_outputs/coherency_checks/maltreat_conf_incoherent_facilities.xlsx"to the desired output"folder/subfolder/filename"
To adapt the code:
- Line 2: Replace
clean_malaria_routine_data_final,conf, andmaltreatwith the data frame, x-variable, and y-variable of your coherency check - Line 6: Replace
"03_outputs/coherency_checks/maltreat_conf_incoherent_facilities.xlsx"to the desired output"folder/subfolder/filename"
Once the plots are made, the next step is to show and discuss with the SNT team to review patterns and validate findings. Failing a coherency check may indicate challenges with high-quality data collection and reporting, but may also indicate other factors. Specific delivery context (e.g., stockouts, outbreak responses, changes in diagnostic practice, etc.) may explain apparent discrepancies.
Flag indicators with incoherent trends for investigation and/or follow-up by the SNT team.
Step 5: Advanced data coherency visualizations
Step 5.1: Coherency summary
Additional visualizations can be created to evaluate data coherency across space and time. The code below generates summaries of the coherency checks applicable to the example data, which go on to be used in the subsequent visualizations.
Missing values are excluded from coherency calculations to distinguish between incomplete reporting (i.e. missing data) and illogical relationships (i.e. incoherent data). By only checking data where both values are present, we get a more accurate picture of data quality issues rather than data incompleteness. To ensure that missing values are excluded, the R code uses na.rm=TRUE and the Python code uses na.where() to explicitly handle missing values.
clean_malaria_routine_data_final <- clean_malaria_routine_data_final |>
dplyr::mutate(
# National/District checks
check_susp_test = susp >= test,
check_allout_susp = allout >= susp,
check_allout_test = allout >= test,
check_test_conf = test >= conf,
check_conf_treat = conf >= maltreat,
check_maladm_maldth = maladm >= maldth,
# Time-series checks (incoherency)
incoherent_susp_test = susp < test,
incoherent_allout_susp = allout < susp,
incoherent_allout_test = allout < test,
incoherent_test_conf = test < conf,
incoherent_conf_treat = conf < maltreat,
incoherent_maladm_maldth = maladm < maldth,
# Single date creation
date = lubridate::make_date(year, month, 1)
)To adapt the code:
- Line 1: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 4-9: Modify list of checks based on the variables present in your data
- Line 12-17: Modify list of checks based on the variables present in your data
In python, Boolean operations will return False instead of NaN if a value is missing. This code uses np.where() to ensure that the Boolean operation returns NaN even when values are missing so that .mean() will automatically skip the NaN values.
df = clean_malaria_routine_data_final.copy()
# National / District checks
df['check_susp_test'] = np.where(
df['susp'].isna() | df['test'].isna(),np.nan,
df['susp'] >= df['test'])
df['check_allout_susp'] = np.where(
df['allout'].isna() | df['susp'].isna(),np.nan,
df['allout'] >= df['susp'])
df['check_allout_test'] = np.where(
df['allout'].isna() | df['test'].isna(),np.nan,
df['allout'] >= df['test'])
df['check_test_conf'] = np.where(
df['test'].isna() | df['conf'].isna(),np.nan,
df['test'] >= df['conf'])
df['check_conf_treat'] = np.where(
df['conf'].isna() | df['maltreat'].isna(),np.nan,
df['conf'] >= df['maltreat'])
df['check_maladm_maldth'] = np.where(
df['maladm'].isna() | df['maldth'].isna(),np.nan,
df['maladm'] >= df['maldth'])
# Time-series checks (incoherency)
df['incoherent_susp_test'] = np.where(
df['susp'].isna() | df['test'].isna(),np.nan,
df['susp'] < df['test'])
df['incoherent_allout_susp'] = np.where(
df['allout'].isna() | df['susp'].isna(),np.nan,
df['allout'] < df['susp'])
df['incoherent_allout_test'] = np.where(
df['allout'].isna() | df['test'].isna(),np.nan,
df['allout'] < df['test'])
df['incoherent_test_conf'] = np.where(
df['test'].isna() | df['conf'].isna(),np.nan,
df['test'] < df['conf'])
df['incoherent_conf_treat'] = np.where(
df['conf'].isna() | df['maltreat'].isna(),np.nan,
df['conf'] < df['maltreat'])
df['incoherent_maladm_maldth'] = np.where(
df['maladm'].isna() | df['maldth'].isna(),np.nan,
df['maladm'] < df['maldth'])
# Single date creation
df['date'] = pd.to_datetime(dict(year=df['year'], month=df['month'], day=1))
clean_malaria_routine_data_final = dfTo adapt the code:
- Line 1: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 4-21: Modify list of checks based on the variables present in your data
- Line 24-41: Modify list of checks based on the variables present in your data
Step 5.2: National coherency visualization
The code below generates a visualization depicting the percentage of monthly reports from health facilities nationwide that passed each coherency check for each year. Each cell shows the proportion of all health facility monthly reports in that year that met the corresponding coherency rule. Assessing these checks across years and coherency check categories helps identify patterns in overall data quality.
Show full code
coherency_metrics <- clean_malaria_routine_data_final |>
dplyr::group_by(year) |>
dplyr::summarise(
dplyr::across(
tidyselect::starts_with("check_"),
~ mean(., na.rm = TRUE) * 100,
.names = "pct_{.col}"
),
.groups = "drop"
) |>
tidyr::pivot_longer(
cols = tidyselect::starts_with("pct_"),
names_to = "check_type",
names_prefix = "pct_check_",
values_to = "pct_coherent"
) |>
dplyr::mutate(
check_label = dplyr::case_when(
check_type == "susp_test" ~ "Suspected ≥ Tested",
check_type == "allout_susp" ~ "Outpatient ≥ Suspected",
check_type == "allout_test" ~ "Outpatient ≥ Tested",
check_type == "test_conf" ~ "Tested ≥ Confirmed",
check_type == "conf_treat" ~ "Confirmed ≥ Treated",
check_type == "maladm_maldth" ~ "Malaria admissions ≥ Malaria deaths"
),
check_label = forcats::fct_reorder(check_label, pct_coherent, median)
)
# Create plot
coherency_plot <- coherency_metrics |>
ggplot2::ggplot(ggplot2::aes(
x = factor(year),
y = check_label,
fill = pct_coherent
)) +
ggplot2::geom_tile(color = NA, linewidth = 0) +
ggplot2::geom_text(
ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)),
color = "white",
size = 3.2,
fontface = "bold"
) +
viridis::scale_fill_viridis(
name = "% Coherent",
option = "viridis",
limits = c(0, 100),
direction = -1
) +
ggplot2::labs(
title = "National Data Coherency Checks",
x = "Year",
y = NULL
) +
ggplot2::theme_minimal() +
ggplot2::theme(
panel.grid = ggplot2::element_blank(),
axis.text.y = ggplot2::element_text(size = 10, hjust = 0),
axis.text.x = ggplot2::element_text(size = 9),
plot.caption = ggplot2::element_text(size = 8)
)
# Save plot
coherency_plot |>
ggplot2::ggsave(
filename = "03_outputs/coherency_checks/national_coherency_checks_heatmap.png",
width = 9,
height = 6.5,
dpi = 300
)To adapt the code:
- Line 1: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 19-24: Modify list of checks based on the variables present in your data and listed in Step 5.1
Show full code
df = clean_malaria_routine_data_final.copy()
check_cols = [c for c in df if c.startswith('check_')]
national_coherency = (
df.groupby('year')[check_cols]
.mean() # skips NaNs by default
.multiply(100)
.reset_index()
)
national_long = national_coherency.melt(
id_vars=['year'],
value_vars=[c for c in national_coherency.columns if c != 'year'],
var_name='check_type',
value_name='pct_coherent'
)
# Map check_type to display label
check_label_map = {
"check_susp_test" : "Suspected ≥ Tested",
"check_allout_susp" : "Outpatient ≥ Suspected",
"check_allout_test" : "Outpatient ≥ Tested",
"check_test_conf" : "Tested ≥ Confirmed",
"check_conf_treat" : "Confirmed ≥ Treated",
"check_maladm_maldth": "Malaria admissions ≥ Malaria deaths"
}
national_long['check_label'] = national_long['check_type'].map(check_label_map)
# Reorder for plotting by overall median (optional, like forcats::fct_reorder)
cat_order = (national_long.groupby('check_label')['pct_coherent'].median()
.sort_values(ascending=False).index.tolist())
national_long['check_label'] = pd.Categorical(national_long['check_label'], categories=cat_order, ordered=True)
# Make heatmap
pivot = national_long.pivot(index='check_label', columns='year', values='pct_coherent')
annotations = pivot.round(0).astype(int).astype(str) + '%'
fig, ax = plt.subplots(figsize=(9, 6.5))
sns.heatmap(pivot, annot=annotations, fmt='', cmap='viridis_r', linewidths=0, ax=ax, cbar_kws={'label': '% Coherent'}, vmin=0, vmax=100)
ax.grid(False)
plt.title("National Data Coherency Checks", fontsize=16, weight='bold')
plt.xlabel("Year")
plt.ylabel("")
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "national_coherency_checks_heatmap.png"), dpi=300)
plt.close()To adapt the code:
- Line 1: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 17-24: Modify list of checks based on the variables present in your data and listed in Step 5.1
Step 5.3: District coherency visualization
The code below focuses on one coherency check, continuing the example of confirmed vs. treated cases. However, more granularity is provided by stratifying the check by district in addition to year.
Show full code
adm_coherence <- clean_malaria_routine_data_final |>
dplyr::group_by(adm1, year) |>
dplyr::filter(dplyr::n() >= 5) |> # Minimum facilities filter
dplyr::summarise(
n_incoherent = sum(!check_conf_treat, na.rm = TRUE), # Count FALSE cases
total_hf = dplyr::n(),
pct_coherent = 100 * sum(check_conf_treat, na.rm = TRUE) / total_hf, # % TRUE
.groups = "drop"
) |>
dplyr::mutate(
adm1 = forcats::fct_reorder(adm1, pct_coherent, median)
)
# Create plot
adm_plot <- adm_coherence |>
ggplot2::ggplot(ggplot2::aes(
x = factor(year),
y = adm1,
fill = pct_coherent
)) +
ggplot2::geom_tile(color = NA, linewidth = 0) +
ggplot2::geom_text(
ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)),
color = "white",
size = 2.8,
fontface = "bold"
) +
viridis::scale_fill_viridis(
name = "% Coherent\n(Confirmed ≥ Treated)",
option = "viridis",
limits = c(0, 100),
direction = -1
) +
ggplot2::labs(
title = "District-Level Data Coherency: Confirmed and Treated Cases",
x = "Year",
y = "District (adm1)",
) +
ggplot2::theme_minimal() +
ggplot2::theme(
panel.grid = ggplot2::element_blank(),
axis.text.y = ggplot2::element_text(size = 9),
axis.text.x = ggplot2::element_text(size = 9),
plot.subtitle = ggplot2::element_text(size = 9)
)
# Dynamic height based on number of districts
n_adm <- length(unique(adm_coherence$adm1))
plot_height <- max(6, 0.2 * n_adm + 2)
# Save plot
adm_plot |>
ggplot2::ggsave(
filename = "03_outputs/coherency_checks/adm_confmaltreat_heatmap.png",
width = 9,
height = plot_height,
dpi = 300
)To adapt the code:
- Line 1: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 2: Replace
adm1with the administrative level you’d like to stratify by - Line 5, 7: Replace
check_conf_treatwith the check you wish to plot. Refer to the Step 5.1 code for the check options - Line 18: Replace
adm1withe the adminstrative level defined in Line 2 - Line 29: Modify the legend title based on the coherency check you are plotting
- Line 35: Modify plot title based on the coherency check and administrative level you are plotting
- Line 37: Modify the y-axis title based on the adminstrative level you are plotting
- Line 48: Replace
adm1with the administrative level defined in Line 2
Show full code
df = clean_malaria_routine_data_final.copy()
district = df.groupby(['adm1', 'year']).filter(lambda g: len(g) >= 5)
adm_coherence = (
district.groupby(['adm1', 'year']).agg(
n_incoherent = ('check_conf_treat', lambda x: (x == False).sum()), #count False values
total_hf = ('check_conf_treat', 'size'), #count all rows
n_coherent = ('check_conf_treat', lambda x: (x == True).sum()) #count True values
).reset_index()
)
adm_coherence['pct_coherent'] = (adm_coherence['n_coherent'] / adm_coherence['total_hf']) * 100
# Reorder adm1
cat_order_adm = (adm_coherence.groupby('adm1')['pct_coherent'].median()
.sort_values(ascending=False).index.tolist())
adm_coherence['adm1'] = pd.Categorical(adm_coherence['adm1'], categories=cat_order_adm, ordered=True)
pivot = adm_coherence.pivot(index='adm1', columns='year', values='pct_coherent')
n_adm = len(pivot)
plot_height = max(6, 0.2 * n_adm + 2)
fig, ax = plt.subplots(figsize=(9, plot_height))
annotations = pivot.round(0).astype(int).astype(str) + '%'
sns.heatmap(pivot, annot=annotations, fmt='', cmap='viridis_r', linewidths=0, ax=ax,cbar_kws={'label': '% Coherent\n(Confirmed ≥ Treated)'}, vmin=0, vmax=100)
plt.title("District-Level Data Coherency: Confirmed and Treated Cases", fontsize=14)
plt.xlabel("Year")
plt.ylabel("District (adm1)")
ax.grid(False)
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "adm_confmaltreat_heatmap.png"), dpi=300)
plt.close()To adapt the code:
- Line 1: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 2,4: Replace
adm1with the administrative level you’d like to stratify by - Line 5-7: Replace
check_conf_treatwith the check you wish to plot. Refer to the Step 5.1 code for the check options - Line 13,15,17: Replace
adm1withe the adminstrative level defined in Line 2 - Line 22: Modify the legend title based on the coherency check you are plotting
- Line 23: Modify plot title based on the coherency check and administrative level you are plotting
- Line 25: Modify the y-axis title based on the adminstrative level you are plotting
Step 5.4: Coherency time series
A time series plots allows us to visualize how coherency fluctuates over time. The code below plots the number of incoherent monthly report across the years of observation.
Show full code
incoherency_monthly <- clean_malaria_routine_data_final |>
dplyr::group_by(year, month, date) |>
dplyr::summarise(
dplyr::across(
tidyselect::starts_with("incoherent_"),
~ sum(., na.rm = TRUE),
.names = "{.col}"
),
total_hf = dplyr::n(),
.groups = "drop"
) |>
tidyr::pivot_longer(
cols = tidyselect::starts_with("incoherent_"),
names_to = "check_type",
values_to = "n_incoherent"
) |>
dplyr::mutate(
check_label = dplyr::case_when(
check_type == "incoherent_susp_test" ~ "Suspected < Tested",
check_type == "incoherent_allout_susp" ~ "Outpatient < Suspected",
check_type == "incoherent_allout_test" ~ "Outpatient < Tested",
check_type == "incoherent_test_conf" ~ "Tested < Confirmed",
check_type == "incoherent_conf_treat" ~ "Confirmed < Treated",
check_type == "incoherent_maladm_maldth" ~ "Malaria admissions < Malaria deaths"
)
)
# Create plot
incoherency_plot <- incoherency_monthly |>
ggplot2::ggplot(ggplot2::aes(
x = date,
y = n_incoherent,
color = check_label,
group = check_label
)) +
ggplot2::geom_line(linewidth = 0.8) +
ggplot2::scale_x_date(
breaks = seq(
from = lubridate::floor_date(min(incoherency_monthly$date), "year"),
to = lubridate::ceiling_date(max(incoherency_monthly$date), "year"),
by = "1 year"
),
date_labels = "%Y",
expand = c(0.02, 0)
) +
scale_color_brewer(
name = "Incoherency",
palette = "Set2",
) +
ggplot2::labs(
title = "Monthly Incoherent Reports by Year",
subtitle = "Lines show monthly trends within each year",
x = "Year",
y = "Number of Incoherent Reports"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
legend.position = "bottom",
legend.box.spacing = ggplot2::unit(0.2, "cm"),
panel.grid.minor.x = ggplot2::element_line(color = "gray90"),
legend.title = ggplot2::element_text(size=16),
legend.text = ggplot2::element_text(size = 14),
plot.title = ggplot2::element_text(size = 20, face = "bold"),
plot.subtitle = ggplot2::element_text(size = 18),
axis.title.x = ggplot2::element_text(size = 16, face = "bold"),
axis.title.y = ggplot2::element_text(size = 16, face = "bold"),
axis.text.x = ggplot2::element_text(size = 13),
axis.text.y = ggplot2::element_text(size = 13)
)
# Save plot
incoherency_plot |>
ggplot2::ggsave(
filename = "03_outputs/coherency_checks/incoherency_timeseries.png",
width = 10,
height = 6,
dpi = 300
)To adapt the code:
- Line 1: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 19-24: Modify list of checks based on the variables present in your data and listed in Step 5.1
Show full code
# Aggregate data
df = clean_malaria_routine_data_final.copy()
incoherency_monthly = (
df.groupby(['year', 'month', 'date'])[
[c for c in df if c.startswith('incoherent_')]
].sum() # Remove min_count=1
.fillna(0) # Convert any remaining NaN to 0 (like R)
.reset_index()
)
incoherency_long = incoherency_monthly.melt(
id_vars=['year','month','date'],
value_vars=[c for c in incoherency_monthly if c.startswith('incoherent_')],
var_name='check_type',
value_name='n_incoherent'
)
# Assign labels for plotting
check_label_map_incoh = {
"incoherent_susp_test" : "Suspected < Tested",
"incoherent_allout_susp" : "Outpatient < Suspected",
"incoherent_allout_test" : "Outpatient < Tested",
"incoherent_test_conf" : "Tested < Confirmed",
"incoherent_conf_treat" : "Confirmed < Treated",
"incoherent_maladm_maldth": "Malaria admissions < Malaria deaths"
}
incoherency_long['check_label'] = incoherency_long['check_type'].map(check_label_map_incoh)
unique_labels = [label for label in incoherency_long['check_label'].dropna().unique()]
palette = sns.color_palette("Set2", len(unique_labels))
color_map = dict(zip(unique_labels, palette))
# Plotting
sns.set_style("white")
plt.figure(figsize=(10, 6));
for label, g in incoherency_long.groupby('check_label'):
plt.plot(
g['date'], g['n_incoherent'],
label=label,
lw=1.3,
color=color_map.get(label, "#333333") # fallback color if not found
)
plt.title("Monthly Incoherent Reports by Year", fontsize=18, weight='bold', y=1.05)
# adjust the location of the subtitle by changing the 0.52 value (0.5 is center)
plt.figtext(0.50, 0.9, "Lines show monthly trends within each year", ha='center', fontsize=16)
plt.xlabel("Year",fontsize = 15, weight='bold')
plt.ylabel("Number of Incoherent Reports", fontsize=15, weight='bold')
plt.tick_params(axis='x', labelsize=12)
plt.tick_params(axis='y', labelsize=12)
plt.legend(title="Incoherency", loc='upper center', bbox_to_anchor=(0.5, -0.18),
fontsize=14, ncol=3,title_fontsize = 16)
plt.grid(axis='both', color='lightgray', alpha=0.8)
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "incoherency_timeseries.png"), dpi=300)
plt.close()To adapt the code:
- Line 2: Replace
clean_malaria_routine_data_finalwith the name of your data frame - Line 18-23: Modify list of checks based on the variables present in your data and listed in Step 5.1
Summary
This page provides the steps to review data coherency using scatterplots that visualize one-to-one variable relationships in the context of SNT. Options for subsetting and exporting the coherency check are also provided.
Full code
Find the full code script for data coherency checks below.
Show full code
# Step 1.1: Install and load libraries
# Install pacman only if it's not already installed
if (!requireNamespace("pacman", quietly = TRUE)) {
install.packages("pacman")
}
# Install or load relevant packages
pacman::p_load(
readxl, # import and read Excel files
ggplot2, # plotting
writexl, # export Excel files
tidyr, # data management
dplyr, # data manipulation
forcats, # categorical data management
viridis, # colourblind-friendly palettes
lubridate # date management
)
# Step 1.2: Load data
clean_malaria_routine_data_final <-
rio::import(
here::here(
"1.2_epidemiology/1.2a_routine_surveillance/raw",
"clean_malaria_routine_data_final.xlsx"
)
)
# Step 2: Generate basic scatterplot
confmaltreat <- clean_malaria_routine_data_final |>
ggplot2::ggplot(ggplot2::aes(x = maltreat, y = conf)) +
ggplot2::geom_point(
ggplot2::aes(color = ifelse(maltreat > conf, "red", "#1e81b0")),
alpha = 0.6,
size = 2
) +
ggplot2::scale_color_identity(
name = "",
breaks = c("red", "#1e81b0"),
labels = c("Treated > Confirmed", "Confirmed ≥ Treated"),
guide = "legend"
) +
ggplot2::geom_abline(
slope = 1,
intercept = 0,
color = "black",
linetype = "dashed"
) +
ggplot2::labs(
title = "Confirmed vs. Treated Cases by Health Facility",
x = "Treated Cases",
y = "Confirmed Cases"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(size = 16,
face = "bold",
hjust = 0.5),
axis.title.x = ggplot2::element_text(size = 14),
axis.title.y = ggplot2::element_text(size = 14),
legend.text = ggplot2::element_text(size = 12),
strip.text = ggplot2::element_text(size = 12,
face = "bold"),
legend.position = "right",
axis.text.x = ggplot2::element_text(angle = 90,
vjust = 0.5,
hjust = 1)
) +
ggplot2::coord_equal() +
ggplot2::xlim(
0,
max(clean_malaria_routine_data_final$maltreat,
clean_malaria_routine_data_final$conf,
na.rm = TRUE)
) +
ggplot2::ylim(
0,
max(clean_malaria_routine_data_final$maltreat,
clean_malaria_routine_data_final$conf,
na.rm = TRUE)
)
# Save plot
ggplot2::ggsave(
"03_outputs/coherency_checks/confmaltreat_coherency_check.png",
plot = confmaltreat,
width = 12,
height = 8,
units = "in",
dpi = 300
)
# Step 3.1: Scatterplots by year
confmaltreat_grid_year <- clean_malaria_routine_data_final |>
dplyr::group_by(year) |>
dplyr::mutate(
facet_max = max(c(maltreat, conf), na.rm = TRUE) # Calculate per-facet max values
) |>
dplyr::ungroup() |>
ggplot2::ggplot(
ggplot2::aes(x = maltreat, y = conf)) +
ggplot2::geom_point(
ggplot2::aes(
color = ifelse(maltreat > conf, "red", "#1e81b0")),
alpha = 0.6, size = 1.5
) +
ggplot2::scale_color_identity(
name = "",
breaks = c("red", "#1e81b0"),
labels = c("Treated > Confirmed", "Confirmed ≥ Treated"),
guide = "legend"
) +
ggplot2::geom_abline(
slope = 1,
intercept = 0,
color = "black",
linetype = "dashed"
) +
ggplot2::facet_wrap(~ year,
ncol = 3,
scales = "free") +
ggplot2::labs(
title = "Confirmed vs. Treated Cases by Year",
x = "Treated Cases",
y = "Confirmed Cases"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(size = 22,
face = "bold",
hjust = 0.5),
axis.title.x = ggplot2::element_text(size = 18),
axis.title.y = ggplot2::element_text(size = 18),
legend.text = ggplot2::element_text(size = 16),
strip.text = ggplot2::element_text(size = 14,
face = "bold"),
legend.position = "right",
axis.text.x = ggplot2::element_text(angle = 90,
vjust = 0.5,
hjust = 1,
size = 12),
axis.text.y = ggplot2::element_text(size = 12)
) +
ggplot2::geom_blank(ggplot2::aes(x = facet_max,
y = facet_max)) +
ggplot2::geom_blank(ggplot2::aes(x = 0, y = 0))
# Calculate dynamic height based on years of data
n_groups <- length(unique(clean_malaria_routine_data_final$year))
plot_height <- 4 * ceiling(n_groups / 3) # 4 inches per row
# Save plot
ggplot2::ggsave(
"03_outputs/coherency_checks/confmaltreat_grid_year.png",
plot = confmaltreat_grid_year,
width = 12,
height = plot_height,
units = "in",
dpi = 300
)
# Step 3.2: Scatterplots by district
confmaltreat_grid_adm1 <- clean_malaria_routine_data_final |>
dplyr::group_by(adm1) |>
dplyr::mutate(
facet_max = max(c(maltreat, conf), na.rm = TRUE) # Calculate per-facet max values
) |>
dplyr::ungroup() |>
ggplot2::ggplot(
ggplot2::aes(x = maltreat, y = conf)) +
ggplot2::geom_point(
ggplot2::aes(
color = ifelse(maltreat > conf, "red", "#1e81b0")),
alpha = 0.6, size = 1.5
) +
ggplot2::scale_color_identity(
name = "",
breaks = c("red", "#1e81b0"),
labels = c("Treated > Confirmed", "Confirmed ≥ Treated"),
guide = "legend"
) +
ggplot2::geom_abline(
slope = 1,
intercept = 0,
color = "black",
linetype = "dashed"
) +
ggplot2::facet_wrap(~ adm1,
ncol = 4,
scales = "free",
labeller = ggplot2::labeller(adm1 = function(x) gsub(" District", "", x))) +
ggplot2::labs(
title = "Confirmed vs. Treated Cases by District (adm1)",
x = "Treated Cases",
y = "Confirmed Cases"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
plot.title = ggplot2::element_text(size = 24,
face = "bold",
hjust = 0.5),
axis.title.x = ggplot2::element_text(size = 22),
axis.title.y = ggplot2::element_text(size = 22),
legend.text = ggplot2::element_text(size = 20),
strip.text = ggplot2::element_text(size = 18,
face = "bold"),
legend.position = "right",
legend.margin = ggplot2::margin(l = 0.5, r = 2),
axis.text.x = ggplot2::element_text(angle = 90,
vjust = 0.5,
hjust = 1,
size = 16),
axis.text.y = ggplot2::element_text(size = 16)
) +
ggplot2::geom_blank(ggplot2::aes(x = facet_max,
y = facet_max)) +
ggplot2::geom_blank(ggplot2::aes(x = 0, y = 0))
# Calculate dynamic height based on number of adm1 units
n_adm1 <- length(unique(clean_malaria_routine_data_final$adm1))
plot_height <- 4 * ceiling(n_adm1 / 4) # 4 inches per row, 4 plots per row
# Save plot
ggplot2::ggsave(
"03_outputs/coherency_checks/confmaltreat_grid_adm1.png",
plot = confmaltreat_grid_adm1,
width = 16,
height = plot_height,
units = "in",
dpi = 300
)
# Step 4: Save and export select rows
incoherent_facilities <- clean_malaria_routine_data_final[
clean_malaria_routine_data_final$conf >
clean_malaria_routine_data_final$maltreat, ]
writexl::write_xlsx(
incoherent_facilities,
here::here(
"03_outputs/coherency_checks/maltreat_conf_incoherent_facilities.xlsx")
)
# Step 5.1: Coherency summary
clean_malaria_routine_data_final <- clean_malaria_routine_data_final |>
dplyr::mutate(
# National/District checks
check_susp_test = susp >= test,
check_allout_susp = allout >= susp,
check_allout_test = allout >= test,
check_test_conf = test >= conf,
check_conf_treat = conf >= maltreat,
check_maladm_maldth = maladm >= maldth,
# Time-series checks (incoherency)
incoherent_susp_test = susp < test,
incoherent_allout_susp = allout < susp,
incoherent_allout_test = allout < test,
incoherent_test_conf = test < conf,
incoherent_conf_treat = conf < maltreat,
incoherent_maladm_maldth = maladm < maldth,
# Single date creation
date = lubridate::make_date(year, month, 1)
)
# Step 5.2: National coherency visualization
coherency_metrics <- clean_malaria_routine_data_final |>
dplyr::group_by(year) |>
dplyr::summarise(
dplyr::across(
tidyselect::starts_with("check_"),
~ mean(., na.rm = TRUE) * 100,
.names = "pct_{.col}"
),
.groups = "drop"
) |>
tidyr::pivot_longer(
cols = tidyselect::starts_with("pct_"),
names_to = "check_type",
names_prefix = "pct_check_",
values_to = "pct_coherent"
) |>
dplyr::mutate(
check_label = dplyr::case_when(
check_type == "susp_test" ~ "Suspected ≥ Tested",
check_type == "allout_susp" ~ "Outpatient ≥ Suspected",
check_type == "allout_test" ~ "Outpatient ≥ Tested",
check_type == "test_conf" ~ "Tested ≥ Confirmed",
check_type == "conf_treat" ~ "Confirmed ≥ Treated",
check_type == "maladm_maldth" ~ "Malaria admissions ≥ Malaria deaths"
),
check_label = forcats::fct_reorder(check_label, pct_coherent, median)
)
# Create plot
coherency_plot <- coherency_metrics |>
ggplot2::ggplot(ggplot2::aes(
x = factor(year),
y = check_label,
fill = pct_coherent
)) +
ggplot2::geom_tile(color = NA, linewidth = 0) +
ggplot2::geom_text(
ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)),
color = "white",
size = 3.2,
fontface = "bold"
) +
viridis::scale_fill_viridis(
name = "% Coherent",
option = "viridis",
limits = c(0, 100),
direction = -1
) +
ggplot2::labs(
title = "National Data Coherency Checks",
x = "Year",
y = NULL
) +
ggplot2::theme_minimal() +
ggplot2::theme(
panel.grid = ggplot2::element_blank(),
axis.text.y = ggplot2::element_text(size = 10, hjust = 0),
axis.text.x = ggplot2::element_text(size = 9),
plot.caption = ggplot2::element_text(size = 8)
)
# Save plot
coherency_plot |>
ggplot2::ggsave(
filename = "03_outputs/coherency_checks/national_coherency_checks_heatmap.png",
width = 9,
height = 6.5,
dpi = 300
)
# Step 5.3: District coherency visualization
adm_coherence <- clean_malaria_routine_data_final |>
dplyr::group_by(adm1, year) |>
dplyr::filter(dplyr::n() >= 5) |> # Minimum facilities filter
dplyr::summarise(
n_incoherent = sum(!check_conf_treat, na.rm = TRUE), # Count FALSE cases
total_hf = dplyr::n(),
pct_coherent = 100 * sum(check_conf_treat, na.rm = TRUE) / total_hf, # % TRUE
.groups = "drop"
) |>
dplyr::mutate(
adm1 = forcats::fct_reorder(adm1, pct_coherent, median)
)
# Create plot
adm_plot <- adm_coherence |>
ggplot2::ggplot(ggplot2::aes(
x = factor(year),
y = adm1,
fill = pct_coherent
)) +
ggplot2::geom_tile(color = NA, linewidth = 0) +
ggplot2::geom_text(
ggplot2::aes(label = sprintf("%.0f%%", pct_coherent)),
color = "white",
size = 2.8,
fontface = "bold"
) +
viridis::scale_fill_viridis(
name = "% Coherent\n(Confirmed ≥ Treated)",
option = "viridis",
limits = c(0, 100),
direction = -1
) +
ggplot2::labs(
title = "District-Level Data Coherency: Confirmed and Treated Cases",
x = "Year",
y = "District (adm1)",
) +
ggplot2::theme_minimal() +
ggplot2::theme(
panel.grid = ggplot2::element_blank(),
axis.text.y = ggplot2::element_text(size = 9),
axis.text.x = ggplot2::element_text(size = 9),
plot.subtitle = ggplot2::element_text(size = 9)
)
# Dynamic height based on number of districts
n_adm <- length(unique(adm_coherence$adm1))
plot_height <- max(6, 0.2 * n_adm + 2)
# Save plot
adm_plot |>
ggplot2::ggsave(
filename = "03_outputs/coherency_checks/adm_confmaltreat_heatmap.png",
width = 9,
height = plot_height,
dpi = 300
)
# Step 5.4: Coherency time series
incoherency_monthly <- clean_malaria_routine_data_final |>
dplyr::group_by(year, month, date) |>
dplyr::summarise(
dplyr::across(
tidyselect::starts_with("incoherent_"),
~ sum(., na.rm = TRUE),
.names = "{.col}"
),
total_hf = dplyr::n(),
.groups = "drop"
) |>
tidyr::pivot_longer(
cols = tidyselect::starts_with("incoherent_"),
names_to = "check_type",
values_to = "n_incoherent"
) |>
dplyr::mutate(
check_label = dplyr::case_when(
check_type == "incoherent_susp_test" ~ "Suspected < Tested",
check_type == "incoherent_allout_susp" ~ "Outpatient < Suspected",
check_type == "incoherent_allout_test" ~ "Outpatient < Tested",
check_type == "incoherent_test_conf" ~ "Tested < Confirmed",
check_type == "incoherent_conf_treat" ~ "Confirmed < Treated",
check_type == "incoherent_maladm_maldth" ~ "Malaria admissions < Malaria deaths"
)
)
# Create plot
incoherency_plot <- incoherency_monthly |>
ggplot2::ggplot(ggplot2::aes(
x = date,
y = n_incoherent,
color = check_label,
group = check_label
)) +
ggplot2::geom_line(linewidth = 0.8) +
ggplot2::scale_x_date(
breaks = seq(
from = lubridate::floor_date(min(incoherency_monthly$date), "year"),
to = lubridate::ceiling_date(max(incoherency_monthly$date), "year"),
by = "1 year"
),
date_labels = "%Y",
expand = c(0.02, 0)
) +
scale_color_brewer(
name = "Coherency Check",
palette = "Set2",
) +
ggplot2::labs(
title = "Monthly Incoherent Reports by Year",
subtitle = "Lines show monthly trends within each year",
x = "Year",
y = "Number of Incoherent Reports"
) +
ggplot2::theme_minimal() +
ggplot2::theme(
legend.position = "bottom",
legend.box.spacing = ggplot2::unit(0.2, "cm"),
panel.grid.minor.x = ggplot2::element_line(color = "gray90"),
legend.title = ggplot2::element_text(size=16),
legend.text = ggplot2::element_text(size = 14),
plot.title = ggplot2::element_text(size = 20, face = "bold"),
plot.subtitle = ggplot2::element_text(size = 18),
axis.title.x = ggplot2::element_text(size = 16, face = "bold"),
axis.title.y = ggplot2::element_text(size = 16, face = "bold"),
axis.text.x = ggplot2::element_text(size = 13),
axis.text.y = ggplot2::element_text(size = 13)
)
# Save plot
incoherency_plot |>
ggplot2::ggsave(
filename = "03_outputs/coherency_checks/incoherency_timeseries.png",
width = 10,
height = 6,
dpi = 300
)Show full code
# Step 1.1: Install and load libraries
# Install packages (in your terminal, **not** in the code):
pip install pandas openpyxl matplotlib seaborn xlsxwriter numpy plotnine pyreadr
# Import packages
import pandas as pd # data manipulation and Excel
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting, color palette
import numpy as np # numerics
import os # file paths
import pyreadr # reading rds files (optional)
from datetime import datetime # date management
from matplotlib.lines import Line2D # plotting
from pyhere import here # file paths
# Step 1.2: Load data
# Load data using pyhere
result = pyreadr.read_r(here("1.2_epidemiology", "1.2a_routine_surveillance", "raw", "clean_malaria_routine_data_final.rds"))
clean_malaria_routine_data_final = result[None]
# Step 2: Generate basic scatterplot
plt.style.use('bmh') # similar to theme_minimal() in R
plt.figure(figsize=(12, 12))
x = clean_malaria_routine_data_final["maltreat"]
y = clean_malaria_routine_data_final["conf"]
colors = np.where(x > y, "red", "#1e81b0")
plt.scatter(x, y, c=colors, alpha=0.6, s=70, label=None)
# Line: y=x
lims = [0, np.nanmax([x.max(), y.max()])]
_ = plt.plot(lims, lims, 'k--', lw=1) #underscore to suppress output
plt.grid(True, which='major', color='lightgray', linestyle='-', linewidth=1)
plt.gca().set_facecolor('white')
# Legends
from matplotlib.lines import Line2D
legend_elements = [
Line2D([0], [0], marker='o', color='w', label='Treated > Confirmed',
markerfacecolor='red', markersize=10),
Line2D([0], [0], marker='o', color='w', label='Confirmed ≥ Treated',
markerfacecolor='#1e81b0', markersize=10)
]
plt.legend(handles=legend_elements, loc='center right', title='', fontsize=22,facecolor='white')
plt.title("Confirmed vs. Treated Cases by Health Facility", fontsize=24, weight='bold')
plt.xlabel("Treated Cases", fontsize=22, weight='semibold')
plt.ylabel("Confirmed Cases", fontsize=22,weight='semibold')
plt.xticks(rotation=0, fontsize=18)
plt.yticks(fontsize=18)
plt.xlim(lims)
plt.ylim(lims)
plt.gca().set_aspect('equal', adjustable='box')
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "confmaltreat_coherency_check.png"), dpi=300)
plt.close()
# Step 3.1: Scatterplots by year
plt.style.use('bmh') # Minimal, similar to theme_minimal()
grouped = clean_malaria_routine_data_final.copy()
n_years = grouped['year'].nunique()
plot_height = 4.5 * int(np.ceil(n_years / 3)) # 4.5 inches/row, 3 columns
column_width = 5.5 # 5.5 inches/column
plot_width = column_width * 3 # Total width for 3 columns
def add_diag(data, **kwargs):
"""Draw y=x line scaled to each facet's data, similar to geom_blank + scales='free' in R."""
facet_max = np.nanmax([data["maltreat"].max(), data["conf"].max()])
plt.plot([0, facet_max], [0, facet_max], 'k--', lw=1)
plt.xlim(0, facet_max);
plt.ylim(0, facet_max);
g = sns.FacetGrid(
grouped, col="year", col_wrap=3, height=4, sharex=False, sharey=False
)
g.map_dataframe(
lambda data, color: plt.scatter(
data["maltreat"], data["conf"],
c=np.where(data["maltreat"] > data["conf"], "red", "#1e81b0"),
alpha=0.6, s=18,
)
)
g.map_dataframe(add_diag)
g.set_titles("{col_name}")
# Remove individual subplot labels first
g.set_xlabels('')
g.set_ylabels('')
# Add single figure-level labels
g.fig.text(0.5, 0.01, 'Treated Cases', ha='center', fontsize=24, fontweight='semibold') # x-axis label
g.fig.text(0.01, 0.5, 'Confirmed Cases', va='center', rotation='vertical', fontsize=24, fontweight='semibold') # y-axis label
# Per-facet x/y limits and grid + tick style
for ax, (_, group) in zip(g.axes.ravel(), grouped.groupby("year")):
ax.set_facecolor('white')
facet_max = np.nanmax([group["maltreat"].max(), group["conf"].max()])
ax.set_xlim(0, facet_max)
ax.set_ylim(0, facet_max)
ax.title.set_fontsize(20)
ax.title.set_fontweight('bold')
ax.set_aspect('equal', adjustable='box')
ax.grid(True, color='lightgray', linestyle='-', linewidth=1)
ax.tick_params(axis='x', rotation=0, labelsize=18)
ax.tick_params(axis='y',labelsize=18)
g.fig.subplots_adjust(top=0.9, right=0.75, left=0.07, bottom=0.05)
g.fig.suptitle("Confirmed vs. Treated Cases by Year", fontsize=30, fontweight='bold')
g.fig.set_size_inches(plot_width, plot_height)
legend_elements = [
Line2D([0], [0], marker='o', color='w', label='Treated > Confirmed',
markerfacecolor='red', markersize=10),
Line2D([0], [0], marker='o', color='w', label='Confirmed ≥ Treated',
markerfacecolor='#1e81b0', markersize=10)
]
g.fig.legend(
handles=legend_elements,
loc='center left',
bbox_to_anchor=(0.73, 0.5), # Tweak as needed
fontsize=22,
frameon=False
)
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "confmaltreat_grid_year.png"), dpi=300, bbox_inches='tight')
plt.close()
# Step 3.2: Scatterplots by district
plt.style.use('bmh')
grouped = clean_malaria_routine_data_final.copy()
n_adm1 = grouped['adm1'].nunique()
plot_height = 4 * int(np.ceil(n_adm1 / 4)) # 4 inches/row, 4 columns
def add_diag(data, **kwargs):
facet_max = np.nanmax([data["maltreat"].max(), data["conf"].max()])
plt.plot([0, facet_max], [0, facet_max], 'k--', lw=1)
plt.xlim(0, facet_max)
plt.ylim(0, facet_max)
g = sns.FacetGrid(
grouped, col="adm1", col_wrap=4, height=4, sharex=False, sharey=False
)
g.map_dataframe(
lambda data, color: plt.scatter(
data["maltreat"], data["conf"],
c=np.where(data["maltreat"] > data["conf"], "red", "#1e81b0"),
alpha=0.6, s=18,
)
)
g.map_dataframe(add_diag)
# Remove individual subplot labels
g.set_xlabels('')
g.set_ylabels('')
g.set_titles("{col_name}")
# Remove "District" from subplot titles to accommodate long district names
for ax in g.axes.flat:
title = ax.get_title()
ax.set_title(title.replace(' District', ''))
# Add single figure-level labels
g.fig.text(0.5, 0.01, 'Treated Cases', ha='center', fontsize=24)
g.fig.text(0.01, 0.5, 'Confirmed Cases', va='center', rotation='vertical', fontsize=24)
for ax, (_, group) in zip(g.axes.ravel(), grouped.groupby("adm1")):
ax.set_facecolor('white')
facet_max = np.nanmax([group["maltreat"].max(), group["conf"].max()])
ax.set_xlim(0, facet_max)
ax.set_ylim(0, facet_max)
ax.title.set_fontsize(18)
ax.title.set_fontweight('bold')
ax.set_aspect('equal', adjustable='box')
ax.grid(True, color='lightgray', linestyle='-', linewidth=1)
ax.tick_params(axis='x', rotation=0, labelsize=16)
ax.tick_params(axis='y',labelsize=16)
g.fig.subplots_adjust(top=0.95, right=0.75, left=0.09, bottom=0.05)
g.fig.suptitle("Confirmed vs. Treated Cases by District (adm1)", fontsize=26, fontweight='bold')
legend_elements = [
Line2D([0], [0], marker='o', color='w', label='Treated > Confirmed',
markerfacecolor='red', markersize=10),
Line2D([0], [0], marker='o', color='w', label='Confirmed ≥ Treated',
markerfacecolor='#1e81b0', markersize=10)
]
g.fig.legend(
handles=legend_elements,
loc='center left',
bbox_to_anchor=(0.75, 0.5), # Tweak legend location as needed
fontsize=18,
frameon=False
)
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "confmaltreat_grid_adm1.png"), dpi=300, bbox_inches='tight')
plt.close()
# Step 4: Save and export select rows
incoherent_facilities = clean_malaria_routine_data_final[
clean_malaria_routine_data_final['conf'] > clean_malaria_routine_data_final['maltreat']
]
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
out_xlsx = os.path.join(outfolder, "maltreat_conf_incoherent_facilities.xlsx")
incoherent_facilities.to_excel(out_xlsx, index=False)
# Step 5.1: Coherency summary
df = clean_malaria_routine_data_final.copy()
# National / District checks
df['check_susp_test'] = np.where(
df['susp'].isna() | df['test'].isna(),np.nan,
df['susp'] >= df['test'])
df['check_allout_susp'] = np.where(
df['allout'].isna() | df['susp'].isna(),np.nan,
df['allout'] >= df['susp'])
df['check_allout_test'] = np.where(
df['allout'].isna() | df['test'].isna(),np.nan,
df['allout'] >= df['test'])
df['check_test_conf'] = np.where(
df['test'].isna() | df['conf'].isna(),np.nan,
df['test'] >= df['conf'])
df['check_conf_treat'] = np.where(
df['conf'].isna() | df['maltreat'].isna(),np.nan,
df['conf'] >= df['maltreat'])
df['check_maladm_maldth'] = np.where(
df['maladm'].isna() | df['maldth'].isna(),np.nan,
df['maladm'] >= df['maldth'])
# Time-series checks (incoherency)
df['incoherent_susp_test'] = np.where(
df['susp'].isna() | df['test'].isna(),np.nan,
df['susp'] < df['test'])
df['incoherent_allout_susp'] = np.where(
df['allout'].isna() | df['susp'].isna(),np.nan,
df['allout'] < df['susp'])
df['incoherent_allout_test'] = np.where(
df['allout'].isna() | df['test'].isna(),np.nan,
df['allout'] < df['test'])
df['incoherent_test_conf'] = np.where(
df['test'].isna() | df['conf'].isna(),np.nan,
df['test'] < df['conf'])
df['incoherent_conf_treat'] = np.where(
df['conf'].isna() | df['maltreat'].isna(),np.nan,
df['conf'] < df['maltreat'])
df['incoherent_maladm_maldth'] = np.where(
df['maladm'].isna() | df['maldth'].isna(),np.nan,
df['maladm'] < df['maldth'])
# Single date creation
df['date'] = pd.to_datetime(dict(year=df['year'], month=df['month'], day=1))
clean_malaria_routine_data_final = df
# Step 5.2: National coherency visualization
df = clean_malaria_routine_data_final.copy()
check_cols = [c for c in df if c.startswith('check_')]
national_coherency = (
df.groupby('year')[check_cols]
.mean() # skips NaNs by default
.multiply(100)
.reset_index()
)
national_long = national_coherency.melt(
id_vars=['year'],
value_vars=[c for c in national_coherency.columns if c != 'year'],
var_name='check_type',
value_name='pct_coherent'
)
# Map check_type to display label
check_label_map = {
"check_susp_test" : "Suspected ≥ Tested",
"check_allout_susp" : "Outpatient ≥ Suspected",
"check_allout_test" : "Outpatient ≥ Tested",
"check_test_conf" : "Tested ≥ Confirmed",
"check_conf_treat" : "Confirmed ≥ Treated",
"check_maladm_maldth": "Malaria admissions ≥ Malaria deaths"
}
national_long['check_label'] = national_long['check_type'].map(check_label_map)
# Reorder for plotting by overall median (optional, like forcats::fct_reorder)
cat_order = (national_long.groupby('check_label')['pct_coherent'].median()
.sort_values(ascending=False).index.tolist())
national_long['check_label'] = pd.Categorical(national_long['check_label'], categories=cat_order, ordered=True)
# Make heatmap
pivot = national_long.pivot(index='check_label', columns='year', values='pct_coherent')
annotations = pivot.round(0).astype(int).astype(str) + '%'
fig, ax = plt.subplots(figsize=(9, 6.5))
sns.heatmap(pivot, annot=annotations, fmt='', cmap='viridis_r', linewidths=0, ax=ax, cbar_kws={'label': '% Coherent'}, vmin=0, vmax=100, annot_kws={'weight':'bold'})
ax.grid(False)
plt.title("National Data Coherency Checks", fontsize=16, weight='semibold',pad=20)
plt.xlabel("Year", fontsize = 16)
plt.ylabel("")
ax.tick_params(axis='y', labelsize=12)
ax.tick_params(axis='x', labelsize=12)
# Colorbar label size
cbar = ax.collections[0].colorbar
cbar.set_label('% Coherent', fontsize=16)
cbar.ax.tick_params(labelsize=12) # Colorbar tick labels
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "national_coherency_checks_heatmap.png"), dpi=300)
plt.close()
# Step 5.3: District coherency visualization
df = clean_malaria_routine_data_final.copy()
district = df.groupby(['adm1', 'year']).filter(lambda g: len(g) >= 5)
adm_coherence = (
district.groupby(['adm1', 'year']).agg(
n_incoherent = ('check_conf_treat', lambda x: (x == False).sum()), #count False values
total_hf = ('check_conf_treat', 'size'), #count all rows
n_coherent = ('check_conf_treat', lambda x: (x == True).sum()) #count True values
).reset_index()
)
adm_coherence['pct_coherent'] = (adm_coherence['n_coherent'] / adm_coherence['total_hf']) * 100
# Reorder adm1
cat_order_adm = (adm_coherence.groupby('adm1')['pct_coherent'].median()
.sort_values(ascending=False).index.tolist())
adm_coherence['adm1'] = pd.Categorical(adm_coherence['adm1'], categories=cat_order_adm, ordered=True)
pivot = adm_coherence.pivot(index='adm1', columns='year', values='pct_coherent')
n_adm = len(pivot)
plot_height = max(6, 0.2 * n_adm + 2)
fig, ax = plt.subplots(figsize=(9, plot_height))
annotations = pivot.round(0).astype(int).astype(str) + '%'
sns.heatmap(pivot, annot=annotations, fmt='', cmap='viridis_r', linewidths=0, ax=ax,cbar_kws={'label': '% Coherent\n(Confirmed ≥ Treated)'}, vmin=0, vmax=100,annot_kws={'weight':'semibold'})
plt.title("District-Level Data Coherency: Confirmed and Treated Cases", fontsize=16,weight='semibold',pad=20)
plt.xlabel("Year",fontsize=14)
plt.ylabel("District (adm1)",fontsize=14)
ax.grid(False)
ax.tick_params(axis='y', labelsize=12)
ax.tick_params(axis='x', labelsize=12)
cbar = ax.collections[0].colorbar
cbar.set_label('% Coherent\n(Confirmed ≥ Treated)', fontsize=14)
cbar.ax.tick_params(labelsize=12) # Colorbar tick labels
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "adm_confmaltreat_heatmap.png"), dpi=300)
plt.close()
# Step 5.4: Coherency time series
# Aggregate data
df = clean_malaria_routine_data_final.copy()
incoherency_monthly = (
df.groupby(['year', 'month', 'date'])[
[c for c in df if c.startswith('incoherent_')]
].sum()
.fillna(0)
.reset_index()
)
incoherency_long = incoherency_monthly.melt(
id_vars=['year','month','date'],
value_vars=[c for c in incoherency_monthly if c.startswith('incoherent_')],
var_name='check_type',
value_name='n_incoherent'
)
# Assign labels for plotting
check_label_map_incoh = {
"incoherent_susp_test" : "Suspected < Tested",
"incoherent_allout_susp" : "Outpatient < Suspected",
"incoherent_allout_test" : "Outpatient < Tested",
"incoherent_test_conf" : "Tested < Confirmed",
"incoherent_conf_treat" : "Confirmed < Treated",
"incoherent_maladm_maldth": "Malaria admissions < Malaria deaths"
}
incoherency_long['check_label'] = incoherency_long['check_type'].map(check_label_map_incoh)
unique_labels = [label for label in incoherency_long['check_label'].dropna().unique()]
palette = sns.color_palette("Set2", len(unique_labels))
color_map = dict(zip(unique_labels, palette))
# Plotting
sns.set_style("white")
plt.figure(figsize=(10, 6))
for label, g in incoherency_long.groupby('check_label'):
plt.plot(
g['date'], g['n_incoherent'],
label=label,
lw=1.3,
color=color_map.get(label, "#333333") # fallback color if not found
)
plt.title("Monthly Incoherent Reports by Year", fontsize=18, weight='bold', y=1.05);
plt.figtext(0.50, 0.9, "Lines show monthly trends within each year", ha='center', fontsize=16);
plt.xlabel("Year",fontsize = 15, weight='bold');
plt.ylabel("Number of Incoherent Reports", fontsize=15, weight='bold');
plt.tick_params(axis='x', labelsize=12);
plt.tick_params(axis='y', labelsize=12);
plt.legend(title="Incoherency", loc='upper center', bbox_to_anchor=(0.5, -0.18),
fontsize=14, ncol=3,title_fontsize = 16);
plt.grid(axis='both', color='lightgray', alpha=0.8);
plt.tight_layout()
# Save plot
outfolder = os.path.join("03_outputs", "coherency_checks")
os.makedirs(outfolder, exist_ok=True)
plt.savefig(os.path.join(outfolder, "incoherency_timeseries.png"), dpi=300)
plt.close()