loader

get_backend()

Returns the name of the first available backend for data handling. Backends are tried in order: polars, pandas, modin, then pyarrow.

Source code in hdxms_datasets/loader.py
def get_backend():
    """
    Returns the backend used for data handling.
    """
    try:
        import polars  # NOQA: F401 # type: ignore[import]

        return "polars"
    except ImportError:
        pass

    try:
        import pandas  # NOQA: F401 # type: ignore[import]

        return "pandas"
    except ImportError:
        pass

    try:
        import modin  # NOQA: F401 # type: ignore[import]

        return "modin"
    except ImportError:
        pass

    try:
        import pyarrow  # NOQA: F401 # type: ignore[import]

        return "pyarrow"
    except ImportError:
        pass

    raise ImportError("No suitable backend found. Please install pandas, polars, pyarrow or modin.")
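
A quick usage sketch; the printed value depends on which of the supported libraries is installed in your environment:

```python
from hdxms_datasets.loader import get_backend

# Backends are tried in priority order: polars, pandas, modin, pyarrow.
backend = get_backend()
print(backend)  # e.g. "polars" when polars is installed
```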

load_peptides(peptides, base_dir=Path.cwd(), convert=True, aggregate=None, sort_rows=True, sort_columns=True, drop_null=True)

Load peptides from the data file and return a Narwhals DataFrame.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `peptides` | `Peptides` | Peptides object containing metadata and file path. | required |
| `base_dir` | `Path` | Base directory to resolve relative file paths. Defaults to the current working directory. | `Path.cwd()` |
| `convert` | `bool` | Whether to convert the data to a standard format. | `True` |
| `aggregate` | `bool \| None` | Whether to aggregate the data. If `None`, aggregates only if the data is not already aggregated. | `None` |
| `sort_rows` | `bool` | Whether to sort the rows. | `True` |
| `sort_columns` | `bool` | Whether to sort the columns into a standard order. | `True` |
| `drop_null` | `bool` | Whether to drop columns that are entirely null. | `True` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | A Narwhals DataFrame containing the loaded peptide data. |

Source code in hdxms_datasets/loader.py
def load_peptides(
    peptides: Peptides,
    base_dir: Path = Path.cwd(),
    convert: bool = True,
    aggregate: bool | None = None,
    sort_rows: bool = True,
    sort_columns: bool = True,
    drop_null: bool = True,
) -> nw.DataFrame:
    """
    Load peptides from the data file and return a Narwhals DataFrame.

    Args:
        peptides: Peptides object containing metadata and file path.
        base_dir: Base directory to resolve relative file paths. Defaults to the current working directory.
        convert: Whether to convert the data to a standard format.
        aggregate: Whether to aggregate the data. If None, will aggregate if the data is not already aggregated.
        sort_rows: Whether to sort the rows.
        sort_columns: Whether to sort the columns in a standard order.
        drop_null: Whether to drop columns that are entirely null.

    Returns:
        A Narwhals DataFrame containing the loaded peptide data.

    """

    # Resolve the data file path
    if peptides.data_file.is_absolute():
        data_path = peptides.data_file
    else:
        data_path = base_dir / peptides.data_file

    # Load the raw data
    df = read_csv(data_path)

    from hdxms_datasets import process

    df = process.apply_filters(df, **peptides.filters)

    format_spec = FORMAT_LUT.get(peptides.data_format)
    assert format_spec is not None, f"Unknown format: {peptides.data_format}"

    if callable(format_spec.aggregated):
        is_aggregated = format_spec.aggregated(df)
    else:
        is_aggregated = format_spec.aggregated

    # if aggregation is not specified, by default aggregate if the data is not already aggregated
    if aggregate is None:
        aggregate = not is_aggregated

    if aggregate and is_aggregated:
        warnings.warn("Data format is pre-aggregated. Aggregation will be skipped.")
        aggregate = False

    if not convert and aggregate:
        warnings.warn("Cannot aggregate data without conversion. Aggeregation will be skipped.")
        aggregate = False

    if not convert and sort_rows:
        warnings.warn("Cannot sort rows without conversion. Sorting will be skipped.")
        sort_rows = False

    if not convert and sort_columns:
        warnings.warn("Cannot sort columns without conversion. Sorting will be skipped.")
        sort_columns = False

    if convert:
        df = format_spec.convert(df)

    if aggregate:
        df = process.aggregate(df)

    if drop_null:
        df = process.drop_null_columns(df)

    if sort_rows:
        df = process.sort_rows(df)

    if sort_columns:
        df = process.sort_columns(df)

    return df
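
A minimal usage sketch. The `Peptides` object would normally come from a dataset definition; the directory name below is illustrative, not a confirmed path:

```python
from pathlib import Path

from hdxms_datasets.loader import load_peptides

# Assume `peptides` is an existing Peptides object with data_file,
# data_format, and filters populated (construction details omitted).
df = load_peptides(
    peptides,
    base_dir=Path("datasets/example"),  # hypothetical dataset directory
    convert=True,    # convert to the standard column format
    aggregate=None,  # aggregate only if the data is not pre-aggregated
)
print(df.columns)
```

Note that `aggregate`, `sort_rows`, and `sort_columns` all require `convert=True`; with `convert=False` they are skipped with a warning.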

read_csv(source)

Read a CSV file and return a Narwhals DataFrame.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `source` | `Path \| str \| IO \| bytes` | Source object representing the CSV data. | required |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | A Narwhals DataFrame containing the CSV data. |

Source code in hdxms_datasets/loader.py
def read_csv(source: Path | str | IO | bytes) -> nw.DataFrame:
    """
    Read a CSV file and return a Narwhals DataFrame.

    Args:
        source: Source object representing the CSV data.

    Returns:
        A Narwhals DataFrame containing the CSV data.

    """
    if isinstance(source, str):
        return nw.read_csv(source, backend=BACKEND)
    elif isinstance(source, Path):
        return nw.read_csv(source.as_posix(), backend=BACKEND)
    elif isinstance(source, bytes):
        # Raw bytes: polars can read directly from an in-memory buffer.
        import polars as pl

        return nw.from_native(pl.read_csv(source))
    else:
        # File-like objects: try polars first, then fall back to pandas.
        try:
            import polars as pl

            return nw.from_native(pl.read_csv(source))
        except ImportError:
            pass
        try:
            import pandas as pd

            return nw.from_native(pd.read_csv(source))  # type: ignore
        except ImportError:
            raise ValueError("No suitable backend found for reading file-like objects or bytes.")
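
A short usage sketch; the file name and sample columns are illustrative only:

```python
from io import BytesIO
from pathlib import Path

from hdxms_datasets.loader import read_csv

# From a path on disk (hypothetical file name).
df = read_csv(Path("peptides.csv"))

# From raw bytes (requires polars).
raw = b"start,end,sequence\n1,10,MKLVINSEQ\n"
df = read_csv(raw)

# From a file-like object (polars if available, otherwise pandas).
df = read_csv(BytesIO(raw))
print(df.shape)
```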