Skip to content

mihcsme_py.parser#

mihcsme_py.parser #

Parse MIHCSME Excel files into Pydantic models.

parse_excel_to_model(excel_source: Union[str, Path, bytes, BytesIO]) -> MIHCSMEMetadata #

Parse a MIHCSME Excel file into a Pydantic model.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| excel_source | Union[str, Path, bytes, BytesIO] | Path to the MIHCSME Excel file, or bytes/BytesIO of file contents | required |

Returns:

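| Type | Description |
|------|-------------|
| MIHCSMEMetadata | MIHCSMEMetadata instance holding the parsed investigation, study, and assay information, the assay conditions, and any reference sheets (sheets whose name starts with "_") |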

Raises:

| Type | Description |
|------|-------------|
| FileNotFoundError | If the Excel file doesn't exist (when a path is provided) |
| ValueError | If the path does not end in .xlsx/.xls, or if required sheets are missing or malformed |

Source code in src/mihcsme_py/parser.py
def parse_excel_to_model(excel_source: Union[str, Path, bytes, BytesIO]) -> MIHCSMEMetadata:
    """
    Parse a MIHCSME Excel file into a Pydantic model.

    The workbook must contain the Investigation, Study, Assay and Conditions
    sheets. Every sheet whose name starts with "_" is additionally parsed as a
    reference sheet.

    Args:
        excel_source: Path to the MIHCSME Excel file, or bytes/BytesIO of file contents

    Returns:
        MIHCSMEMetadata instance

    Raises:
        FileNotFoundError: If the Excel file doesn't exist (when path is provided)
        ValueError: If a path does not end in .xlsx/.xls, or if required sheets
            are missing or malformed
    """
    # Handle bytes input (e.g., from file upload)
    if isinstance(excel_source, bytes):
        excel_source = BytesIO(excel_source)
        source_name = "<uploaded file>"
    elif isinstance(excel_source, BytesIO):
        source_name = "<uploaded file>"
    else:
        # Handle path input
        filepath = Path(excel_source)

        if not filepath.exists():
            raise FileNotFoundError(f"Excel file not found: {filepath}")

        if filepath.suffix.lower() not in [".xlsx", ".xls"]:
            raise ValueError(f"File must be Excel format (.xlsx/.xls): {filepath}")

        source_name = str(filepath)
        excel_source = filepath

    logger.info(f"Parsing MIHCSME Excel file: {source_name}")

    try:
        # The context manager closes the workbook even when validation or
        # parsing raises, so a failed parse does not leak the file handle.
        with pd.ExcelFile(excel_source) as xls:
            available_sheets = xls.sheet_names

            # Check for required sheets
            required_sheets = [SHEET_INVESTIGATION, SHEET_STUDY, SHEET_ASSAY, SHEET_CONDITIONS]
            missing_sheets = [s for s in required_sheets if s not in available_sheets]
            if missing_sheets:
                raise ValueError(f"Missing required sheets: {', '.join(missing_sheets)}")

            # All required sheets are known to exist from here on, so no
            # further per-sheet presence checks are needed. A sheet that
            # yields no groups leaves the corresponding model as None.

            # Parse Investigation Information
            investigation_info = None
            groups_data = _parse_key_value_sheet(xls, SHEET_INVESTIGATION)
            if groups_data:
                investigation_info = InvestigationInformation.from_groups_dict(groups_data)

            # Parse Study Information
            study_info = None
            groups_data = _parse_key_value_sheet(xls, SHEET_STUDY)
            if groups_data:
                study_info = StudyInformation.from_groups_dict(groups_data)

            # Parse Assay Information
            assay_info = None
            groups_data = _parse_key_value_sheet(xls, SHEET_ASSAY)
            if groups_data:
                assay_info = AssayInformation.from_groups_dict(groups_data)

            # Parse Assay Conditions
            assay_conditions = _parse_assay_conditions(xls, SHEET_CONDITIONS)

            # Parse Reference Sheets (identified by a leading underscore);
            # sheets that produce no data are skipped.
            reference_sheets = []
            for sheet_name in available_sheets:
                if sheet_name.startswith("_"):
                    ref_data = _parse_reference_sheet(xls, sheet_name)
                    if ref_data:
                        reference_sheets.append(ReferenceSheet(name=sheet_name, data=ref_data))

        return MIHCSMEMetadata(
            investigation_information=investigation_info,
            study_information=study_info,
            assay_information=assay_info,
            assay_conditions=assay_conditions,
            reference_sheets=reference_sheets,
        )

    except Exception as e:
        # Log with the source name for context, then let the caller handle it.
        logger.error(f"Failed to parse Excel file '{source_name}': {e}")
        raise