import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
import os
def generate_qc_report(
    validation_results,
    missing_data,
    flagged_records_count,
    mapping_success_rates,
    visualization_images,
    impute_strategy,
    quality_scores,
    output_path_or_buffer,
    report_format='pdf',
    file_identifier=None
):
    """
    Generate a quality control report as a PDF or Markdown document.

    Both formats contain the same sections, in the same order: imputation
    strategy, data quality scores, schema validation results, missing data
    summary, flagged-record count, ontology mapping success rates and
    visualizations.

    Args:
        validation_results: Mapping of check name -> result. A pandas
            DataFrame is summarised by its row count ("N issues found." or
            "No issues found."); any other value is rendered as-is.
        missing_data: Object with ``.items()`` yielding
            (column name, missing-value count) pairs.
        flagged_records_count: Number of records flagged for missing data.
        mapping_success_rates: Mapping of ontology id -> dict with the keys
            ``total_terms``, ``mapped_terms`` and ``success_rate``.
        visualization_images: Iterable of image file paths. In the PDF a
            missing file is reported as "Image not found"; in Markdown each
            path is emitted as an image link without checking existence.
        impute_strategy: Name of the imputation strategy; a falsy value
            (``None`` or ``""``) is shown as "(No Imputation Strategy)".
        quality_scores: Mapping of score name -> number, formatted with two
            decimals and a trailing ``%``.
        output_path_or_buffer: Destination. Either a filesystem path
            (``str`` or ``os.PathLike``) or a writable file-like object.
            For Markdown, text buffers receive ``str`` and all other
            buffers receive UTF-8 encoded ``bytes``.
        report_format: ``'pdf'`` or ``'md'``.
        file_identifier: Optional label of the source file, shown under the
            report title when provided.

    Raises:
        ValueError: If ``report_format`` is neither ``'pdf'`` nor ``'md'``.
    """
    # Function-scope imports keep this block self-contained.
    import io
    from xml.sax.saxutils import escape

    # One shared fallback so PDF and Markdown agree on empty strategies.
    strategy_display = (
        impute_strategy.capitalize() if impute_strategy
        else "(No Imputation Strategy)"
    )
    is_path = isinstance(output_path_or_buffer, (str, os.PathLike))

    if report_format == 'pdf':
        def safe(value):
            # Paragraph parses its text as XML-like markup, so characters
            # such as '<' or '&' coming from data must be escaped.
            return escape(str(value))

        styles = getSampleStyleSheet()
        normal = styles['Normal']
        heading = styles['Heading2']
        story = []

        # Title
        story.append(Paragraph("PhenoQC Quality Control Report", styles['Title']))
        story.append(Spacer(1, 12))
        if file_identifier:
            story.append(Paragraph(f"<b>Source file:</b> {safe(file_identifier)}", normal))
            story.append(Spacer(1, 12))

        # Imputation Strategy
        story.append(Paragraph("Imputation Strategy Used:", heading))
        story.append(Paragraph(safe(strategy_display), normal))
        story.append(Spacer(1, 12))

        # Data Quality Scores
        story.append(Paragraph("Data Quality Scores:", heading))
        for score_name, score_value in quality_scores.items():
            story.append(Paragraph(f"<b>{safe(score_name)}:</b> {score_value:.2f}%", normal))
        story.append(Spacer(1, 12))

        # Schema Validation Results
        story.append(Paragraph("Schema Validation Results:", heading))
        for key, value in validation_results.items():
            if isinstance(value, pd.DataFrame):
                if not value.empty:
                    summary = f"{len(value)} issues found."
                else:
                    summary = "No issues found."
            else:
                summary = safe(value)
            story.append(Paragraph(f"<b>{safe(key)}:</b> {summary}", normal))
        story.append(Spacer(1, 12))

        # Missing Data Summary
        story.append(Paragraph("Missing Data Summary:", heading))
        for column, count in missing_data.items():
            story.append(Paragraph(f"<b>{safe(column)}:</b> {safe(count)} missing values", normal))
        story.append(Spacer(1, 12))

        # Records Flagged for Missing Data
        story.append(Paragraph(
            f"<b>Records Flagged for Missing Data:</b> {safe(flagged_records_count)}",
            normal
        ))
        story.append(Spacer(1, 12))

        # Ontology Mapping Success Rates
        story.append(Paragraph("Ontology Mapping Success Rates:", heading))
        for ontology_id, stats in mapping_success_rates.items():
            story.append(Paragraph(f"{safe(ontology_id)}:", styles['Heading3']))
            story.append(Paragraph(f"<b>Total Terms:</b> {safe(stats['total_terms'])}", normal))
            story.append(Paragraph(f"<b>Mapped Terms:</b> {safe(stats['mapped_terms'])}", normal))
            story.append(Paragraph(f"<b>Success Rate:</b> {stats['success_rate']:.2f}%", normal))
            story.append(Spacer(1, 12))

        # Visualizations
        story.append(Paragraph("Visualizations:", heading))
        for image_path in visualization_images:
            if os.path.exists(image_path):
                # Large fixed size so axis labels stay legible on the page.
                story.append(Image(image_path, width=6.5 * inch, height=5 * inch))
                story.append(Spacer(1, 12))
            else:
                story.append(Paragraph(f"Image not found: {safe(image_path)}", normal))

        # The same constructor call serves paths and buffers; PathLike
        # objects are normalised to a plain string first.
        target = os.fspath(output_path_or_buffer) if is_path else output_path_or_buffer
        doc = SimpleDocTemplate(target, pagesize=letter)
        doc.build(story)

    elif report_format == 'md':
        md_lines = []
        md_lines.append("# PhenoQC Quality Control Report\n")
        if file_identifier:
            # Mirrors the "Source file" line of the PDF report.
            md_lines.append(f"**Source file:** {file_identifier}\n")

        md_lines.append("## Imputation Strategy Used")
        md_lines.append(f"{strategy_display}\n")
        md_lines.append("\n")

        md_lines.append("## Data Quality Scores")
        for score_name, score_value in quality_scores.items():
            md_lines.append(f"- **{score_name}**: {score_value:.2f}%")
        md_lines.append("")

        md_lines.append("## Schema Validation Results")
        for key, value in validation_results.items():
            if isinstance(value, pd.DataFrame):
                if not value.empty:
                    md_lines.append(f"- **{key}**: {len(value)} issues found.")
                else:
                    md_lines.append(f"- **{key}**: No issues found.")
            else:
                md_lines.append(f"- **{key}**: {value}")
        md_lines.append("")

        md_lines.append("## Missing Data Summary")
        for column, count in missing_data.items():
            md_lines.append(f"- **{column}**: {count} missing values")
        md_lines.append("")

        md_lines.append(f"**Records Flagged for Missing Data**: {flagged_records_count}\n")

        md_lines.append("## Ontology Mapping Success Rates")
        for ontology_id, stats in mapping_success_rates.items():
            md_lines.append(f"### {ontology_id}")
            md_lines.append(f"- **Total Terms**: {stats['total_terms']}")
            md_lines.append(f"- **Mapped Terms**: {stats['mapped_terms']}")
            md_lines.append(f"- **Success Rate**: {stats['success_rate']:.2f}%")
            md_lines.append("")

        md_lines.append("## Visualizations")
        for image_path in visualization_images:
            image_filename = os.path.basename(image_path)
            # Embed the image: file name as alt text, given path as target.
            md_lines.append(f"![{image_filename}]({image_path})")
            md_lines.append("")

        content = '\n'.join(md_lines)
        if is_path:
            # Explicit encoding so the file matches the bytes written to
            # binary buffers regardless of the platform default.
            with open(output_path_or_buffer, 'w', encoding='utf-8') as f:
                f.write(content)
        elif isinstance(output_path_or_buffer, io.TextIOBase):
            output_path_or_buffer.write(content)
        else:
            output_path_or_buffer.write(content.encode('utf-8'))

    else:
        raise ValueError("Unsupported report format. Use 'pdf' or 'md'.")
def _phenotype_bar_figure(column, phenotype_counts):
    """Build the "top terms" bar chart for one phenotype column."""
    chart_title = f"Top 20 Most Common Terms in {column}"
    fig_bar = px.bar(
        phenotype_counts,
        labels={'index': 'Phenotype Term', 'value': 'Count'},
        title=chart_title,
        template='plotly_white'
    )
    fig_bar.update_layout(
        plot_bgcolor="#FFFFFF",
        paper_bgcolor="#FFFFFF",
        font={'color': "#2C3E50", 'size': 12},
        title={
            'text': chart_title,
            'y': 0.97, 'x': 0.45,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 16}
        },
        showlegend=False,
        width=1200,
        height=700,
        margin=dict(t=120, b=200, l=140, r=120),
        bargap=0.25
    )
    # Terms may be non-string values (e.g. numeric codes); stringify before
    # measuring/truncating so the label logic cannot raise.
    term_labels = [str(term) for term in phenotype_counts.index]
    fig_bar.update_xaxes(
        # Explicit title: the 'index' label key above only matches when the
        # counts' index is unnamed, which depends on the pandas version.
        title_text='Phenotype Term',
        # Positional tickvals (0..n-1) are only meaningful on a category axis.
        type='category',
        tickangle=60,
        automargin=True,
        tickfont={'size': 10},
        ticktext=[
            f"{label[:40]}..." if len(label) > 40 else label
            for label in term_labels
        ],
        tickvals=list(range(len(phenotype_counts))),
        showticklabels=True,
        tickmode='array'
    )
    fig_bar.update_yaxes(title_text='Count')
    return fig_bar


def _mapping_pie_figure(column, onto_id, total_count, mapped_count):
    """Build the mapped/unmapped donut chart for one column/ontology pair."""
    unmapped_count = total_count - mapped_count
    fig_pie = go.Figure(data=[go.Pie(
        labels=['Mapped', 'Unmapped'],
        values=[mapped_count, unmapped_count],
        hole=0.4,
        marker=dict(colors=['#4C72B0', '#DD8452']),
        textinfo='label+percent',
        textposition='outside',
        textfont={'size': 14},
        hovertemplate="<b>%{label}</b><br>Count: %{value}"
                      "<br>Percentage: %{percent}<extra></extra>"
    )])
    fig_pie.update_layout(
        title={
            'text': f"Mapping Results: {column} → {onto_id}",
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 16}
        },
        annotations=[{
            'text': (
                f"Total Valid Terms: {total_count}<br>"
                f"Mapped: {mapped_count} "
                # Guard against division by zero when no valid terms exist.
                f"({(mapped_count / total_count * 100 if total_count else 0):.1f}%)<br>"
                f"Unmapped: {unmapped_count} "
                f"({(unmapped_count / total_count * 100 if total_count else 0):.1f}%)"
            ),
            'x': 0.5,
            'y': -0.2,
            'showarrow': False,
            'font': {'size': 12}
        }],
        showlegend=True,
        legend={
            'orientation': 'h',
            'yanchor': 'bottom',
            'y': -0.3,
            'xanchor': 'center',
            'x': 0.5
        },
        width=900,
        height=700,
        plot_bgcolor="#FFFFFF",
        paper_bgcolor="#FFFFFF",
        font={'color': "#2C3E50"},
        margin=dict(t=120, b=180, l=100, r=100)
    )
    return fig_pie


def create_visual_summary(df, phenotype_columns=None, output_image_path=None,
                          invalid_terms=None):
    """
    Create visual summaries with extra layout room so axis labels stay visible.

    Figures are produced in this order:
        1) Missingness heatmap (white/blue)
        2) Bar plot of % missing per column
        3) Numeric histograms, ignoring ID-like columns
        4) For each phenotype column: a bar chart of its 20 most common
           terms, then one mapped/unmapped donut chart per ontology whose
           ``<ontology>_ID`` column exists in ``df``

    Args:
        df: pandas DataFrame to summarise. Steps 1-3 are skipped when it is
            empty.
        phenotype_columns: Optional mapping of column name -> iterable of
            ontology ids. Columns absent from ``df`` or containing only
            nulls are skipped.
        output_image_path: Accepted for backward compatibility; not used by
            this function.
        invalid_terms: Optional iterable of placeholder terms that are
            excluded from the mapping statistics. Defaults to the built-in
            list of known junk terms.

    Returns:
        list: Plotly figures in the order described above.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"create_visual_summary() expects a pandas DataFrame, but got {type(df)}."
        )

    if invalid_terms is None:
        invalid_terms = ('NotARealTerm', 'ZZZZ:9999999', 'PhenotypeJunk', 'InvalidTerm42')
    excluded_terms = list(invalid_terms)

    figs = []

    # 1) Missingness visuals and numeric histograms
    if not df.empty:
        figs.append(create_missingness_heatmap(df))
        figs.append(create_missingness_distribution(df))
        # NOTE(review): substring match, so any label containing "id"
        # (e.g. "width") is also treated as an ID column - confirm intent.
        # str() keeps non-string column labels from raising.
        possible_ids = [c for c in df.columns if "id" in str(c).lower()]
        figs.extend(create_numeric_histograms(df, unique_id_cols=possible_ids))

    # 2) Phenotype-based plots
    if phenotype_columns:
        for column, ontologies in phenotype_columns.items():
            if column not in df.columns:
                continue
            non_null_values = df[column].dropna()
            if len(non_null_values) == 0:
                continue

            phenotype_counts = non_null_values.value_counts().head(20)
            figs.append(_phenotype_bar_figure(column, phenotype_counts))

            # Rows that hold a real (non-null, non-placeholder) term.
            has_valid_term = df[column].notna() & ~df[column].isin(excluded_terms)
            total_count = has_valid_term.sum()

            for onto_id in (ontologies or []):
                mapped_col = f"{onto_id}_ID"
                if mapped_col not in df.columns:
                    continue
                mapped_count = (df[mapped_col].notna() & has_valid_term).sum()
                figs.append(
                    _mapping_pie_figure(column, onto_id, total_count, mapped_count)
                )

    return figs
def create_missingness_distribution(df):
    """
    Build a horizontal bar chart of the percentage of missing values per column.

    The percentage is the per-column null count divided by ``len(df)``;
    bars are sorted by that percentage in ascending order.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        The Plotly figure.
    """
    null_counts = df.isna().sum()
    summary = pd.DataFrame({
        "column": null_counts.index,
        "percent_missing": (null_counts / len(df)) * 100
    })
    summary = summary.sort_values("percent_missing", ascending=True)

    chart = px.bar(
        summary,
        x="percent_missing",
        y="column",
        orientation="h",
        title="Percentage of Missing Data by Column",
        template="plotly_white",
        color_discrete_sequence=["#d62728"]
    )
    chart.update_layout(
        height=500,
        width=800,
        margin={"l": 120, "r": 80, "t": 60, "b": 60},
        font={"size": 12}
    )
    # automargin lets Plotly widen the margins when labels need more room.
    chart.update_xaxes(title_text="Percent Missing", automargin=True)
    chart.update_yaxes(title_text="Columns", automargin=True)
    return chart
def create_missingness_heatmap(df):
    """
    Build a missingness heatmap on a white-to-blue scale.

    Each cell is 0 (present, white) or 1 (missing, #3B82F6). Columns are
    ordered from most to fewest missing values.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        The Plotly figure.
    """
    null_flags = df.isna().astype(int)
    ranked_columns = null_flags.sum().sort_values(ascending=False).index
    null_flags = null_flags[ranked_columns]

    heatmap = px.imshow(
        null_flags,
        zmin=0,
        zmax=1,
        color_continuous_scale=[(0.0, "white"), (1.0, "#3B82F6")],
        labels={"color": "Missing"},
        aspect="auto",
        title="Missingness Heatmap"
    )

    heatmap.update_layout(
        height=800,
        width=1200,
        # Generous margins leave room for long labels and the lowered title.
        margin={"l": 130, "r": 130, "t": 180, "b": 200},
        font={"size": 12},
        xaxis={"side": "top"},
        # Title sits below the top edge so it stays clear of the x labels.
        title={
            "text": "Missingness Heatmap",
            "x": 0.5,
            "y": 0.90,
            "xanchor": "center",
            "yanchor": "bottom"
        }
    )

    # Steeply angled ticks plus extra standoff keep column names readable.
    heatmap.update_xaxes(
        title={"text": "Columns", "standoff": 70},
        tickangle=80,
        automargin=True
    )
    heatmap.update_yaxes(
        title={"text": "Rows", "standoff": 20},
        automargin=True
    )
    return heatmap
def create_numeric_histograms(df, unique_id_cols=None, max_cols=5):
    """
    Build one histogram figure per numeric column.

    Args:
        df: pandas DataFrame to inspect.
        unique_id_cols: Optional collection of column names to leave out.
        max_cols: The remaining numeric columns are sliced to
            ``[:max_cols]`` before plotting.

    Returns:
        list: Plotly histogram figures, in column order.
    """
    skipped = [] if unique_id_cols is None else unique_id_cols
    candidates = [
        name
        for name in df.select_dtypes(include=["number"]).columns.tolist()
        if name not in skipped
    ]

    def _histogram(name):
        # One 30-bin histogram with the shared size and margins.
        chart = px.histogram(
            df,
            x=name,
            nbins=30,
            title=f"Distribution of {name}",
            template="plotly_white",
            color_discrete_sequence=["#1f77b4"]
        )
        chart.update_layout(
            height=400,
            width=600,
            margin={"l": 60, "r": 60, "t": 60, "b": 60},
            font={"size": 12},
        )
        return chart

    return [_histogram(name) for name in candidates[:max_cols]]