Source code for src.reporting

import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
import os

def _validation_summary(value):
    """Return the one-line summary shown for a single validation result."""
    if isinstance(value, pd.DataFrame):
        if value.empty:
            return "No issues found."
        return f"{len(value)} issues found."
    return f"{value}"


def generate_qc_report(
    validation_results,
    missing_data,
    flagged_records_count,
    mapping_success_rates,
    visualization_images,
    impute_strategy,
    quality_scores,
    output_path_or_buffer,
    report_format='pdf',
    file_identifier=None
):
    """
    Generate a quality control report as a PDF or a Markdown document.

    Args:
        validation_results: Object with ``.items()`` yielding
            ``(check name, result)``. A pandas DataFrame result is summarised
            by its row count ("N issues found." / "No issues found."); any
            other value is rendered as text.
        missing_data: Object with ``.items()`` yielding
            ``(column, missing count)`` pairs.
        flagged_records_count: Number printed as "Records Flagged for
            Missing Data".
        mapping_success_rates: Mapping of ontology id -> dict with the keys
            ``'total_terms'``, ``'mapped_terms'`` and ``'success_rate'``.
        visualization_images: Iterable of image paths. The PDF embeds each
            file that exists on disk and prints a notice otherwise; the
            Markdown output links each image by its base name only
            (presumably the images sit next to the report - confirm with
            the caller).
        impute_strategy: Name of the imputation strategy; a falsy value is
            reported as "(No Imputation Strategy)".
        quality_scores: Mapping of score name -> numeric value, printed with
            two decimals and a trailing ``%``.
        output_path_or_buffer: Destination. For ``'md'`` this may be a
            ``str``/path-like file name, a text stream, or a binary stream
            (which receives UTF-8 bytes). For ``'pdf'`` it is handed to
            ``SimpleDocTemplate``.
        report_format: ``'pdf'`` (default) or ``'md'``.
        file_identifier: Optional label for the source file, shown near the
            top of the report in both formats.

    Raises:
        ValueError: If ``report_format`` is neither ``'pdf'`` nor ``'md'``.
    """
    import io  # local import: only needed to tell text streams from binary ones

    if report_format not in ('pdf', 'md'):
        raise ValueError("Unsupported report format. Use 'pdf' or 'md'.")

    # Same rule for both formats: anything falsy means "no strategy"; str()
    # keeps a non-string strategy object from crashing .capitalize().
    if impute_strategy:
        strategy_display = str(impute_strategy).capitalize()
    else:
        strategy_display = "(No Imputation Strategy)"

    if report_format == 'pdf':
        # NOTE(review): values are interpolated unescaped into the text given
        # to Paragraph, which carries inline tags such as <b>; confirm that
        # inputs never contain stray markup characters.
        styles = getSampleStyleSheet()
        story = []

        # Title
        story.append(Paragraph("PhenoQC Quality Control Report", styles['Title']))
        story.append(Spacer(1, 12))

        if file_identifier:
            story.append(Paragraph(f"<b>Source file:</b> {file_identifier}", styles['Normal']))
            story.append(Spacer(1, 12))

        # Imputation Strategy
        story.append(Paragraph("Imputation Strategy Used:", styles['Heading2']))
        story.append(Paragraph(strategy_display, styles['Normal']))
        story.append(Spacer(1, 12))

        # Data Quality Scores
        story.append(Paragraph("Data Quality Scores:", styles['Heading2']))
        for score_name, score_value in quality_scores.items():
            story.append(Paragraph(f"<b>{score_name}:</b> {score_value:.2f}%", styles['Normal']))
        story.append(Spacer(1, 12))

        # Schema Validation Results
        story.append(Paragraph("Schema Validation Results:", styles['Heading2']))
        for key, value in validation_results.items():
            story.append(Paragraph(
                f"<b>{key}:</b> {_validation_summary(value)}",
                styles['Normal']
            ))
        story.append(Spacer(1, 12))

        # Missing Data Summary
        story.append(Paragraph("Missing Data Summary:", styles['Heading2']))
        for column, count in missing_data.items():
            story.append(Paragraph(f"<b>{column}:</b> {count} missing values", styles['Normal']))
        story.append(Spacer(1, 12))

        # Records Flagged for Missing Data
        story.append(Paragraph(
            f"<b>Records Flagged for Missing Data:</b> {flagged_records_count}",
            styles['Normal']
        ))
        story.append(Spacer(1, 12))

        # Ontology Mapping Success Rates
        story.append(Paragraph("Ontology Mapping Success Rates:", styles['Heading2']))
        for ontology_id, stats in mapping_success_rates.items():
            story.append(Paragraph(f"{ontology_id}:", styles['Heading3']))
            story.append(Paragraph(f"<b>Total Terms:</b> {stats['total_terms']}", styles['Normal']))
            story.append(Paragraph(f"<b>Mapped Terms:</b> {stats['mapped_terms']}", styles['Normal']))
            story.append(Paragraph(f"<b>Success Rate:</b> {stats['success_rate']:.2f}%", styles['Normal']))
            story.append(Spacer(1, 12))

        # Visualizations
        story.append(Paragraph("Visualizations:", styles['Heading2']))
        for image_path in visualization_images:
            if os.path.exists(image_path):
                # Fixed, generous size so that axis labels stay legible
                story.append(Image(image_path, width=6.5 * inch, height=5 * inch))
                story.append(Spacer(1, 12))
            else:
                story.append(Paragraph(f"Image not found: {image_path}", styles['Normal']))

        # Strings and buffers are passed through unchanged; path-like objects
        # are converted to a plain file name first.
        pdf_target = output_path_or_buffer
        if isinstance(pdf_target, os.PathLike):
            pdf_target = os.fspath(pdf_target)
        doc = SimpleDocTemplate(pdf_target, pagesize=letter)
        doc.build(story)
        return

    # report_format == 'md'
    md_lines = []
    md_lines.append("# PhenoQC Quality Control Report\n")
    if file_identifier:
        # Mirror the PDF report, which also names the source file up front
        md_lines.append(f"**Source file**: {file_identifier}\n")
    md_lines.append("## Imputation Strategy Used")
    md_lines.append(f"{strategy_display}\n")
    md_lines.append("\n")
    md_lines.append("## Data Quality Scores")
    for score_name, score_value in quality_scores.items():
        md_lines.append(f"- **{score_name}**: {score_value:.2f}%")
    md_lines.append("")
    md_lines.append("## Schema Validation Results")
    for key, value in validation_results.items():
        md_lines.append(f"- **{key}**: {_validation_summary(value)}")
    md_lines.append("")
    md_lines.append("## Missing Data Summary")
    for column, count in missing_data.items():
        md_lines.append(f"- **{column}**: {count} missing values")
    md_lines.append("")
    md_lines.append(f"**Records Flagged for Missing Data**: {flagged_records_count}\n")
    md_lines.append("## Ontology Mapping Success Rates")
    for ontology_id, stats in mapping_success_rates.items():
        md_lines.append(f"### {ontology_id}")
        md_lines.append(f"- **Total Terms**: {stats['total_terms']}")
        md_lines.append(f"- **Mapped Terms**: {stats['mapped_terms']}")
        md_lines.append(f"- **Success Rate**: {stats['success_rate']:.2f}%")
        md_lines.append("")
    md_lines.append("## Visualizations")
    for image_path in visualization_images:
        image_filename = os.path.basename(image_path)
        md_lines.append(f"![{image_filename}]({image_filename})")
        md_lines.append("")

    report_text = '\n'.join(md_lines)
    if isinstance(output_path_or_buffer, (str, os.PathLike)):
        # Explicit UTF-8 so the file matches what binary buffers receive,
        # independent of the platform's default encoding.
        with open(output_path_or_buffer, 'w', encoding='utf-8') as f:
            f.write(report_text)
    elif isinstance(output_path_or_buffer, io.TextIOBase):
        # Text streams (e.g. io.StringIO) take str, not bytes
        output_path_or_buffer.write(report_text)
    else:
        output_path_or_buffer.write(report_text.encode('utf-8'))


# Placeholder terms that are left out of the mapped/unmapped statistics.
_EXCLUDED_PLACEHOLDER_TERMS = [
    'NotARealTerm', 'ZZZZ:9999999', 'PhenotypeJunk', 'InvalidTerm42'
]


def _shorten_label(term, limit=40):
    """Return ``term`` as text, cut to ``limit`` characters plus '...' if longer."""
    label = str(term)  # terms may be numbers or other non-string values
    return f"{label[:limit]}..." if len(label) > limit else label


def _term_frequency_bar(column, phenotype_counts):
    """Build the bar chart of the most common terms found in ``column``."""
    chart_title = f"Top 20 Most Common Terms in {column}"
    fig_bar = px.bar(
        phenotype_counts,
        labels={'index': 'Phenotype Term', 'value': 'Count'},
        title=chart_title,
        template='plotly_white'
    )
    fig_bar.update_layout(
        plot_bgcolor="#FFFFFF",
        paper_bgcolor="#FFFFFF",
        font={'color': "#2C3E50", 'size': 12},
        title={
            'text': chart_title,
            'y': 0.97, 'x': 0.45,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 16}
        },
        showlegend=False,
        width=1200,
        height=700,
        margin=dict(t=120, b=200, l=140, r=120),
        bargap=0.25
    )
    # Axis titles are set explicitly: the `labels` mapping above only applies
    # when the plotted columns are really called 'index' and 'value', which
    # is not the case when the counts' index carries its own name.
    fig_bar.update_xaxes(
        title_text='Phenotype Term',
        tickangle=60,
        automargin=True,
        tickfont={'size': 10},
        ticktext=[_shorten_label(term) for term in phenotype_counts.index],
        tickvals=list(range(len(phenotype_counts))),
        showticklabels=True,
        tickmode='array'
    )
    fig_bar.update_yaxes(title_text='Count')
    return fig_bar


def _mapping_result_pie(column, onto_id, total_count, mapped_count):
    """Build the mapped/unmapped donut chart for one column/ontology pair."""
    unmapped_count = total_count - mapped_count
    # Guard the percentages against a column without any countable term
    mapped_pct = mapped_count / total_count * 100 if total_count else 0
    unmapped_pct = unmapped_count / total_count * 100 if total_count else 0

    fig_pie = go.Figure(data=[go.Pie(
        labels=['Mapped', 'Unmapped'],
        values=[mapped_count, unmapped_count],
        hole=0.4,
        marker=dict(colors=['#4C72B0', '#DD8452']),
        textinfo='label+percent',
        textposition='outside',
        textfont={'size': 14},
        hovertemplate="<b>%{label}</b><br>Count: %{value}"
                      "<br>Percentage: %{percent}<extra></extra>"
    )])
    fig_pie.update_layout(
        title={
            'text': f"Mapping Results: {column} → {onto_id}",
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': {'size': 16}
        },
        annotations=[{
            'text': (
                f"Total Valid Terms: {total_count}<br>"
                f"Mapped: {mapped_count} "
                f"({mapped_pct:.1f}%)<br>"
                f"Unmapped: {unmapped_count} "
                f"({unmapped_pct:.1f}%)"
            ),
            'x': 0.5,
            'y': -0.2,
            'showarrow': False,
            'font': {'size': 12}
        }],
        showlegend=True,
        legend={
            'orientation': 'h',
            'yanchor': 'bottom',
            'y': -0.3,
            'xanchor': 'center',
            'x': 0.5
        },
        width=900,
        height=700,
        plot_bgcolor="#FFFFFF",
        paper_bgcolor="#FFFFFF",
        font={'color': "#2C3E50"},
        margin=dict(t=120, b=180, l=100, r=100)
    )
    return fig_pie


def create_visual_summary(df, phenotype_columns=None, output_image_path=None):
    """
    Build the list of summary figures for a dataset.

    For a non-empty frame the list starts with:
      1) the missingness heatmap,
      2) the bar chart of percent missing per column,
      3) histograms of numeric columns, skipping every column whose
         lower-cased name contains "id".
    For each entry of ``phenotype_columns`` that is present in ``df`` and has
    at least one non-null value it then adds:
      4) a bar chart of the 20 most frequent terms, and
      5) one mapped/unmapped donut chart per ontology whose ``<onto_id>_ID``
         column exists in ``df``.

    Args:
        df: pandas DataFrame to summarise.
        phenotype_columns: Optional mapping of column name -> iterable of
            ontology ids (a single id given as a string is also accepted).
        output_image_path: Accepted for compatibility; not used here.

    Returns:
        list: The figures, in the order described above. Empty when ``df`` is
        empty and no phenotype column yields data.

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError(
            f"create_visual_summary() expects a pandas DataFrame, but got {type(df)}."
        )

    figs = []

    # 1) Missingness visuals and numeric distributions
    if not df.empty:
        figs.append(create_missingness_heatmap(df))
        figs.append(create_missingness_distribution(df))
        # NOTE(review): plain substring match - it also skips columns such as
        # "width" or "valid"; tighten if such numeric columns should be plotted.
        possible_ids = [c for c in df.columns if "id" in c.lower()]
        figs.extend(create_numeric_histograms(df, unique_id_cols=possible_ids))

    # 2) Phenotype-based plots
    for column, ontologies in (phenotype_columns or {}).items():
        if column not in df.columns:
            continue
        non_null_values = df[column].dropna()
        if len(non_null_values) == 0:
            continue

        phenotype_counts = non_null_values.value_counts().head(20)
        figs.append(_term_frequency_bar(column, phenotype_counts))

        # A lone ontology id must not be iterated character by character
        if isinstance(ontologies, str):
            ontologies = [ontologies]

        # Rows that count towards the mapping statistics; this depends only
        # on the phenotype column, so it is computed once per column.
        countable = df[column].notna() & ~df[column].isin(_EXCLUDED_PLACEHOLDER_TERMS)
        total_count = countable.sum()

        for onto_id in ontologies:
            mapped_col = f"{onto_id}_ID"
            if mapped_col not in df.columns:
                continue
            mapped_count = (df[mapped_col].notna() & countable).sum()
            figs.append(_mapping_result_pie(column, onto_id, total_count, mapped_count))

    return figs

def create_missingness_distribution(df):
    """
    Return a horizontal bar chart of the percentage of missing values per column.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        The plotly figure, with one bar per column of ``df`` sorted by
        ascending percentage. A frame without rows reports 0% for every
        column.
    """
    missing_count = df.isna().sum()
    n_rows = len(df)
    if n_rows:
        missing_percent = (missing_count / n_rows) * 100
    else:
        # Without rows the ratio would be 0/0 (NaN); nothing is missing
        missing_percent = missing_count.astype(float)

    data = pd.DataFrame({
        "column": missing_count.index,
        "percent_missing": missing_percent
    }).sort_values("percent_missing", ascending=True)

    fig = px.bar(
        data,
        x="percent_missing",
        y="column",
        orientation="h",
        title="Percentage of Missing Data by Column",
        template="plotly_white",
        color_discrete_sequence=["#d62728"]
    )
    fig.update_layout(
        height=500,
        width=800,
        margin=dict(l=120, r=80, t=60, b=60),
        font=dict(size=12)
    )
    # automargin lets the axes grow to fit long column names
    fig.update_xaxes(title_text="Percent Missing", automargin=True)
    fig.update_yaxes(title_text="Columns", automargin=True)
    return fig

def create_missingness_heatmap(df):
    """
    Build a two-colour heatmap of where values are missing in ``df``.

    Present cells (0) are drawn white and missing cells (1) in blue
    (#3B82F6). Columns are ordered from most to fewest missing values.
    """
    na_flags = df.isna().astype(int)
    most_missing_first = na_flags.sum().sort_values(ascending=False).index
    na_flags = na_flags[most_missing_first]

    heatmap = px.imshow(
        na_flags,
        zmin=0,
        zmax=1,
        color_continuous_scale=[(0.0, "white"), (1.0, "#3B82F6")],
        labels={"color": "Missing"},
        aspect="auto",
        title="Missingness Heatmap",
    )

    # Large canvas with wide margins so the rotated column labels fit; the
    # title is anchored below the top edge to keep it clear of the x-axis
    # labels, which are drawn along the top of the plot.
    heatmap.update_layout(
        width=1200,
        height=800,
        margin={"l": 130, "r": 130, "t": 180, "b": 200},
        font={"size": 12},
        xaxis={"side": "top"},
        title={
            "text": "Missingness Heatmap",
            "x": 0.5,
            "y": 0.90,
            "xanchor": "center",
            "yanchor": "bottom",
        },
    )

    # Steep tick angle plus extra standoff between tick labels and axis title
    heatmap.update_xaxes(
        title={"text": "Columns", "standoff": 70},
        tickangle=80,
        automargin=True,
    )
    heatmap.update_yaxes(
        title={"text": "Rows", "standoff": 20},
        automargin=True,
    )
    return heatmap

def create_numeric_histograms(df, unique_id_cols=None, max_cols=5):
    """
    Create one histogram figure per numeric column of ``df``.

    Args:
        df: pandas DataFrame to plot.
        unique_id_cols: Column names to leave out (for example identifier
            columns). May be ``None``, a single column name, or an iterable
            of names.
        max_cols: Only the first ``max_cols`` remaining numeric columns are
            plotted.

    Returns:
        list: The histogram figures, in column order; empty when no numeric
        column remains.
    """
    if unique_id_cols is None:
        excluded = []
    elif isinstance(unique_id_cols, str):
        # A bare string would otherwise turn `col not in ...` into a
        # substring test and drop unrelated columns.
        excluded = [unique_id_cols]
    else:
        excluded = list(unique_id_cols)

    numeric_cols = [
        col
        for col in df.select_dtypes(include=["number"]).columns.tolist()
        if col not in excluded
    ][:max_cols]

    figs = []
    for col in numeric_cols:
        fig = px.histogram(
            df,
            x=col,
            nbins=30,
            title=f"Distribution of {col}",
            template="plotly_white",
            color_discrete_sequence=["#1f77b4"]
        )
        fig.update_layout(
            height=400,
            width=600,
            margin=dict(l=60, r=60, t=60, b=60),
            font=dict(size=12),
        )
        figs.append(fig)
    return figs