Skip to content

formats

FormatSpec dataclass

Specification for a data format

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `name` | `str` | Name of the format. | required |
| `required_columns` | `list[str]` | List of columns required to identify this format. | required |
| `filter_columns` | `list[str]` | List of columns that can be used to filter data. | required |
| `converter` | `Callable[[DataFrame], DataFrame]` | Function to convert a DataFrame to OpenHDX format. | required |
| `aggregated` | `bool \| Callable[[DataFrame], bool]` | Whether the format is aggregated, or a function to determine if a DataFrame is aggregated. | required |
Source code in hdxms_datasets/formats.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
@dataclass(frozen=True)
class FormatSpec:
    """Describe a single tabular data format.

    A spec groups together the column names used to recognise the format,
    the columns usable for filtering, the conversion routine, and an
    aggregation marker that is either a fixed boolean or a per-DataFrame
    predicate.

    Args:
        name: Name of the format.
        required_columns: Columns that must all be present for a DataFrame
            to be identified as this format.
        filter_columns: Columns that can be used to filter data.
        converter: Callable that converts a DataFrame to OpenHDX format.
        aggregated: Either a boolean stating whether the format is
            aggregated, or a callable deciding that for a given DataFrame.

    """

    name: str
    required_columns: list[str]
    filter_columns: list[str]
    converter: Callable[[nw.DataFrame], nw.DataFrame]
    aggregated: bool | Callable[[nw.DataFrame], bool]

    def matches(self, df: nw.DataFrame) -> bool:
        """Return True when every required column is present in ``df``.

        Only column names are inspected; an empty ``required_columns``
        list therefore matches any DataFrame.
        """
        available = set(df.columns)
        needed = set(self.required_columns)
        return available >= needed

    def convert(self, df: nw.DataFrame) -> nw.DataFrame:
        """Convert ``df`` to OpenHDX format by calling ``self.converter``."""
        to_openhdx = self.converter
        return to_openhdx(df)

    def is_aggregated(self, df: nw.DataFrame | None = None) -> bool:
        """Report whether data in this format is aggregated.

        Args:
            df: DataFrame to inspect. Only needed when ``aggregated`` is a
                callable rather than a fixed boolean.

        Returns:
            ``True`` if ``aggregated`` is exactly ``True``; the callable's
            result if ``aggregated`` is callable; ``False`` otherwise.

        Raises:
            ValueError: If ``aggregated`` is callable and ``df`` is None.
        """
        flag = self.aggregated
        if flag is True:
            return True
        if not callable(flag):
            return False
        # A predicate cannot be evaluated without data to look at.
        if df is None:
            raise ValueError("DataFrame must be provided to check aggregation")
        return flag(df)

convert(df)

Convert DataFrame to OpenHDX format.

Source code in hdxms_datasets/formats.py
69
70
71
def convert(self, df: nw.DataFrame) -> nw.DataFrame:
    """Convert ``df`` to OpenHDX format by calling ``self.converter``."""
    to_openhdx = self.converter
    return to_openhdx(df)

is_aggregated(df=None)

Check if a DataFrame is aggregated.

Source code in hdxms_datasets/formats.py
73
74
75
76
77
78
79
80
81
def is_aggregated(self, df: nw.DataFrame | None = None) -> bool:
    """Report whether data in this format is aggregated.

    Args:
        df: DataFrame to inspect. Only needed when ``self.aggregated`` is a
            callable rather than a fixed boolean.

    Returns:
        ``True`` if ``self.aggregated`` is exactly ``True``; the callable's
        result if it is callable; ``False`` otherwise.

    Raises:
        ValueError: If ``self.aggregated`` is callable and ``df`` is None.
    """
    flag = self.aggregated
    if flag is True:
        return True
    if not callable(flag):
        return False
    # A predicate cannot be evaluated without data to look at.
    if df is None:
        raise ValueError("DataFrame must be provided to check aggregation")
    return flag(df)

matches(df)

Check if a DataFrame matches this format.

Source code in hdxms_datasets/formats.py
63
64
65
66
67
def matches(self, df: nw.DataFrame) -> bool:
    """Return True when every required column is present in ``df``.

    Only column names are inspected; an empty ``required_columns`` list
    therefore matches any DataFrame.
    """
    available = set(df.columns)
    needed = set(self.required_columns)
    return available >= needed

identify_format(df)

Identify the format of a DataFrame from its columns.

Source code in hdxms_datasets/formats.py
199
200
201
202
203
204
def identify_format(df: nw.DataFrame) -> Optional[FormatSpec]:
    """Return the first entry of ``FORMATS`` whose ``matches(df)`` is true.

    Candidates are tried in the iteration order of ``FORMATS``; ``None`` is
    returned when no candidate matches.
    """
    candidates = (spec for spec in FORMATS if spec.matches(df))
    return next(candidates, None)