def parse_excel_to_model(excel_source: Union[str, Path, bytes, BytesIO]) -> MIHCSMEMetadata:
    """
    Parse a MIHCSME Excel file into a Pydantic model.

    The workbook must contain the Investigation, Study, Assay and Conditions
    sheets. Every sheet whose name starts with an underscore is additionally
    parsed as a reference sheet; reference sheets that yield no data are
    skipped.

    Args:
        excel_source: Path to the MIHCSME Excel file, or bytes/BytesIO of file contents

    Returns:
        MIHCSMEMetadata instance

    Raises:
        FileNotFoundError: If the Excel file doesn't exist (when path is provided)
        ValueError: If the path does not have an .xlsx/.xls suffix, or if
            required sheets are missing or malformed
    """
    # Handle bytes input (e.g., from file upload)
    if isinstance(excel_source, bytes):
        excel_source = BytesIO(excel_source)
        source_name = "<uploaded file>"
    elif isinstance(excel_source, BytesIO):
        source_name = "<uploaded file>"
    else:
        # Handle path input
        filepath = Path(excel_source)
        if not filepath.exists():
            raise FileNotFoundError(f"Excel file not found: {filepath}")
        if filepath.suffix.lower() not in [".xlsx", ".xls"]:
            raise ValueError(f"File must be Excel format (.xlsx/.xls): {filepath}")
        source_name = str(filepath)
        excel_source = filepath

    logger.info(f"Parsing MIHCSME Excel file: {source_name}")

    try:
        # The context manager closes the workbook on every exit path,
        # including the ValueError for missing sheets and any parser failure.
        with pd.ExcelFile(excel_source) as xls:
            available_sheets = xls.sheet_names

            # Check for required sheets
            required_sheets = [SHEET_INVESTIGATION, SHEET_STUDY, SHEET_ASSAY, SHEET_CONDITIONS]
            missing_sheets = [s for s in required_sheets if s not in available_sheets]
            if missing_sheets:
                raise ValueError(f"Missing required sheets: {', '.join(missing_sheets)}")

            # From here on all required sheets are known to exist, so no
            # further per-sheet presence checks are needed.

            # Parse Investigation Information (stays None if the sheet yields no groups)
            investigation_info = None
            groups_data = _parse_key_value_sheet(xls, SHEET_INVESTIGATION)
            if groups_data:
                investigation_info = InvestigationInformation.from_groups_dict(groups_data)

            # Parse Study Information
            study_info = None
            groups_data = _parse_key_value_sheet(xls, SHEET_STUDY)
            if groups_data:
                study_info = StudyInformation.from_groups_dict(groups_data)

            # Parse Assay Information
            assay_info = None
            groups_data = _parse_key_value_sheet(xls, SHEET_ASSAY)
            if groups_data:
                assay_info = AssayInformation.from_groups_dict(groups_data)

            # Parse Assay Conditions
            assay_conditions = _parse_assay_conditions(xls, SHEET_CONDITIONS)

            # Parse Reference Sheets (names prefixed with "_")
            reference_sheets = []
            for sheet_name in available_sheets:
                if not sheet_name.startswith("_"):
                    continue
                ref_data = _parse_reference_sheet(xls, sheet_name)
                if ref_data:
                    reference_sheets.append(ReferenceSheet(name=sheet_name, data=ref_data))

        # The workbook is closed at this point; build the model from the
        # already-parsed data.
        return MIHCSMEMetadata(
            investigation_information=investigation_info,
            study_information=study_info,
            assay_information=assay_info,
            assay_conditions=assay_conditions,
            reference_sheets=reference_sheets,
        )
    except Exception as e:
        # Log with the source name for context, then propagate unchanged.
        logger.error(f"Failed to parse Excel file '{source_name}': {e}")
        raise