# Source code for preclink.inspect.report
"""Inspection report generation."""
from dataclasses import dataclass, field
import pandas as pd
from preclink.inspect.diagnostics import LinkageDiagnostics
[docs]
@dataclass
class InspectionReport:
    """Bundle of linkage diagnostics plus the records that need a closer look.

    Args:
        diagnostics: Linkage diagnostics.
        ambiguous_pairs: Pairs with close scores that may be ambiguous.
        unmatched_left: Left records that were not matched.
        unmatched_right: Right records that were not matched.
    """

    diagnostics: LinkageDiagnostics
    ambiguous_pairs: pd.DataFrame = field(default_factory=pd.DataFrame)
    unmatched_left: pd.DataFrame = field(default_factory=pd.DataFrame)
    unmatched_right: pd.DataFrame = field(default_factory=pd.DataFrame)

    def summary(self) -> str:
        """Render the report as plain text.

        The header block with record, pair and match counts is always
        present. A score-distribution section is added only when
        ``diagnostics.score_stats`` is truthy, and the ambiguous-pair count
        only when ``ambiguous_pairs`` is non-empty.

        Returns:
            Human-readable summary string, one item per line.
        """
        diag = self.diagnostics

        # Header: counts use thousands separators, rates are percentages.
        out = [
            "Linkage Report",
            40 * "=",
            f"Left records: {diag.n_left:,}",
            f"Right records: {diag.n_right:,}",
            f"Candidate pairs: {diag.n_candidate_pairs:,}",
            f"Filtered pairs: {diag.n_filtered_pairs:,}",
            f"Final matches: {diag.n_matches:,}",
            f"Left match rate: {diag.match_rate_left:.1%}",
            f"Right match rate: {diag.match_rate_right:.1%}",
        ]

        stats = diag.score_stats
        if stats:
            # Build the whole section first so a missing key leaves `out` untouched.
            score_section = [
                "",
                "Score Distribution",
                20 * "-",
                f" Min: {stats['min']:.3f}",
                f" Max: {stats['max']:.3f}",
                f" Mean: {stats['mean']:.3f}",
                f" Median: {stats['median']:.3f}",
            ]
            out += score_section

        n_ambiguous = len(self.ambiguous_pairs)
        if not self.ambiguous_pairs.empty:
            out.append("")
            out.append(f"Ambiguous pairs: {n_ambiguous:,}")

        return "\n".join(out)
[docs]
def _collect_ambiguous_pairs(
    filtered_pairs: pd.DataFrame,
    margin_threshold: float,
) -> pd.DataFrame:
    """Return all pairs of left records whose two best scores are nearly tied.

    Args:
        filtered_pairs: Pairs after filtering; read via its ``left_index``
            and ``score`` columns.
        margin_threshold: A left record is flagged when the gap between its
            highest and second-highest score is strictly below this value.

    Returns:
        Every row of ``filtered_pairs`` belonging to a flagged left record,
        grouped by left record in order of first appearance. An empty
        DataFrame when the input is empty, has no ``score`` column, or no
        left record is flagged.
    """
    if filtered_pairs.empty or "score" not in filtered_pairs.columns:
        return pd.DataFrame()

    flagged_groups = []
    # A single groupby pass avoids re-filtering the whole frame once per left
    # index. sort=False yields groups in order of first appearance, matching
    # the order produced by iterating over ``unique()``.
    for _, left_pairs in filtered_pairs.groupby("left_index", sort=False):
        if len(left_pairs) < 2:
            # A single candidate has no runner-up to be confused with.
            continue
        scores = left_pairs["score"].sort_values(ascending=False)
        if scores.iloc[0] - scores.iloc[1] < margin_threshold:
            flagged_groups.append(left_pairs)

    if not flagged_groups:
        return pd.DataFrame()

    # Concatenate the group frames rather than re-selecting by index label,
    # which would repeat rows if the frame's index contains duplicate labels.
    return pd.concat(flagged_groups)


def generate_report(
    left: pd.DataFrame,
    right: pd.DataFrame,
    matches: pd.DataFrame,
    diagnostics: LinkageDiagnostics,
    filtered_pairs: pd.DataFrame,
    margin_threshold: float = 0.1,
) -> InspectionReport:
    """Generate an inspection report for linkage results.

    Args:
        left: Left DataFrame.
        right: Right DataFrame.
        matches: Final matches; its ``left_index`` / ``right_index`` columns
            are compared against the index of ``left`` / ``right``.
        diagnostics: Linkage diagnostics, passed through to the report.
        filtered_pairs: Pairs after filtering.
        margin_threshold: Threshold for identifying ambiguous pairs. A left
            record is ambiguous when its two best scores differ by strictly
            less than this value.

    Returns:
        InspectionReport with the diagnostics, the ambiguous pairs and copies
        of the unmatched rows of ``left`` and ``right``.
    """
    # An empty ``matches`` frame may lack the index columns entirely, so only
    # read them when there is at least one match.
    if matches.empty:
        matched_left_indices = set()
        matched_right_indices = set()
    else:
        matched_left_indices = set(matches["left_index"])
        matched_right_indices = set(matches["right_index"])

    unmatched_left = left.loc[~left.index.isin(matched_left_indices)].copy()
    unmatched_right = right.loc[~right.index.isin(matched_right_indices)].copy()

    ambiguous = _collect_ambiguous_pairs(filtered_pairs, margin_threshold)

    return InspectionReport(
        diagnostics=diagnostics,
        ambiguous_pairs=ambiguous,
        unmatched_left=unmatched_left,
        unmatched_right=unmatched_right,
    )