Skip to content

formats

FormatSpec dataclass

Specification for a data format

Source code in hdxms_datasets/formats.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
@dataclass(frozen=True)
class FormatSpec:
    """Describe a single data format and how to handle frames in it.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Label of the format.
    name: str
    # Columns that must all be present for a DataFrame to match (see ``matches``).
    required_columns: list[str]
    # Column names associated with filtering; not read by any method of this class.
    filter_columns: list[str]
    # Callable that ``convert`` delegates to.
    converter: Callable[[nw.DataFrame], nw.DataFrame]
    # Either a fixed flag, or a predicate evaluated per DataFrame (see ``is_aggregated``).
    aggregated: bool | Callable[[nw.DataFrame], bool]

    def matches(self, df: nw.DataFrame) -> bool:
        """Return True when every required column is present in ``df``."""
        # An empty difference means nothing required is missing.
        missing = set(self.required_columns) - set(df.columns)
        return not missing

    def convert(self, df: nw.DataFrame) -> nw.DataFrame:
        """Run this format's converter on ``df`` (conversion to OpenHDX format)."""
        transform = self.converter
        return transform(df)

    def is_aggregated(self, df: nw.DataFrame | None = None) -> bool:
        """Report whether data in this format is aggregated.

        Args:
            df: Frame handed to the ``aggregated`` predicate; only needed when
                ``aggregated`` is a callable.

        Returns:
            True if ``aggregated`` is exactly ``True``, the predicate's result
            if ``aggregated`` is callable, and False otherwise.

        Raises:
            ValueError: If ``aggregated`` is callable and ``df`` is None.
        """
        flag = self.aggregated
        if flag is True:
            return True
        if not callable(flag):
            # Covers False and any other non-callable value.
            return False
        if df is None:
            raise ValueError("DataFrame must be provided to check aggregation")
        return flag(df)

convert(df)

Convert DataFrame to OpenHDX format

Source code in hdxms_datasets/formats.py
60
61
62
def convert(self, df: nw.DataFrame) -> nw.DataFrame:
    """Run this format's converter on ``df`` (conversion to OpenHDX format)."""
    transform = self.converter
    return transform(df)

is_aggregated(df=None)

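Check whether data in this format is aggregated. When the format's `aggregated` setting is a callable, `df` must be provided and is passed to it; otherwise a `ValueError` is raised.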

Source code in hdxms_datasets/formats.py
64
65
66
67
68
69
70
71
72
def is_aggregated(self, df: nw.DataFrame | None = None) -> bool:
    """Report whether data in this format is aggregated.

    Args:
        df: Frame handed to the ``aggregated`` predicate; only needed when
            ``aggregated`` is a callable.

    Returns:
        True if ``aggregated`` is exactly ``True``, the predicate's result if
        ``aggregated`` is callable, and False otherwise.

    Raises:
        ValueError: If ``aggregated`` is callable and ``df`` is None.
    """
    flag = self.aggregated
    if flag is True:
        return True
    if not callable(flag):
        # Covers False and any other non-callable value.
        return False
    if df is None:
        raise ValueError("DataFrame must be provided to check aggregation")
    return flag(df)

matches(df)

Check if a DataFrame matches this format, i.e. whether all of the format's required columns are present in the DataFrame's columns.

Source code in hdxms_datasets/formats.py
54
55
56
57
58
def matches(self, df: nw.DataFrame) -> bool:
    """Return True when every required column is present in ``df``."""
    # An empty difference means nothing required is missing.
    missing = set(self.required_columns) - set(df.columns)
    return not missing

identify_format(df)

Identify the format of a DataFrame from its columns. Returns the first registered format that matches, or `None` if no format matches.

Source code in hdxms_datasets/formats.py
190
191
192
193
194
195
def identify_format(df: nw.DataFrame) -> Optional[FormatSpec]:
    """Return the first entry of ``FORMATS`` that matches ``df``, or None.

    Entries are tried in the iteration order of ``FORMATS``; checking stops at
    the first one whose ``matches(df)`` is truthy.
    """
    candidates = (spec for spec in FORMATS if spec.matches(df))
    return next(candidates, None)