Skip to content

loader

Module for loading various HDX-MS formats.

get_backend()

Returns the backend used for data handling.

Source code in hdxms_datasets/loader.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def get_backend():
    """
    Returns the backend used for data handling.

    Tries polars, pandas, modin and pyarrow in order of preference and
    returns the name of the first one that can be imported.

    Returns:
        The name of the first available backend as a string.

    Raises:
        ImportError: If none of the supported backends is installed.
    """
    import importlib

    # Order matters: the first importable backend wins.
    for backend in ("polars", "pandas", "modin", "pyarrow"):
        try:
            importlib.import_module(backend)
        except ImportError:
            continue
        return backend

    raise ImportError("No suitable backend found. Please install pandas, polars, pyarrow or modin.")

hxms_line_generator(source)

Generate lines from an HXMS file.

Source code in hdxms_datasets/loader.py
102
103
104
105
106
107
108
def hxms_line_generator(source: Path) -> Iterator[str]:
    """
    Lazily yield lines from an HXMS file.

    Args:
        source: Path to the HXMS file.

    Yields:
        Each line of the file with its trailing newline characters removed.
    """
    with source.open("r", encoding="utf-8") as stream:
        yield from (raw_line.rstrip("\r\n") for raw_line in stream)

load_data(data_file)

Load data from the specified file and return a Narwhals DataFrame.

Parameters:

Name Type Description Default
data_file Path

Path to the data file.

required

Returns:

Type Description
DataFrame

A Narwhals DataFrame containing the loaded data.

Source code in hdxms_datasets/loader.py
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
def load_data(data_file: Path) -> nw.DataFrame:
    """
    Load data from the specified file and return a Narwhals DataFrame.

    Args:
        data_file: Path to the data file (``.csv`` or ``.hxms``).

    Returns:
        A Narwhals DataFrame containing the loaded data.

    Raises:
        ValueError: If the file extension is not supported.
    """
    suffix = data_file.suffix.lower()

    if suffix == ".csv":
        return read_csv(data_file)

    if suffix == ".hxms":
        parsed = read_hxms(data_file)
        assert "DATA" in parsed, "No data found in HXMS file"
        return parsed["DATA"]

    raise ValueError(f"Unsupported file format: {data_file.suffix}")

load_peptides(peptides, base_dir=Path.cwd(), convert=True, aggregate=None, sort_rows=True, sort_columns=True, drop_null=True)

Load peptides from the data file and return a Narwhals DataFrame.

Parameters:

Name Type Description Default
peptides Peptides

Peptides object containing metadata and file path.

required
base_dir Path

Base directory to resolve relative file paths. Defaults to the current working directory.

cwd()
convert bool

Whether to convert the data to a standard format.

True
aggregate bool | None

Whether to aggregate the data. If None, will aggregate if the data is not already aggregated.

None
sort_rows bool

Whether to sort the rows.

True
sort_columns bool

Whether to sort the columns in a standard order.

True
drop_null bool

Whether to drop columns that are entirely null.

True

Returns:

Type Description
DataFrame

A Narwhals DataFrame containing the loaded peptide data.

Source code in hdxms_datasets/loader.py
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
def load_peptides(
    peptides: Peptides,
    base_dir: Path | None = None,
    convert: bool = True,
    aggregate: bool | None = None,
    sort_rows: bool = True,
    sort_columns: bool = True,
    drop_null: bool = True,
) -> nw.DataFrame:
    """
    Load peptides from the data file and return a Narwhals DataFrame.

    Args:
        peptides: Peptides object containing metadata and file path.
        base_dir: Base directory to resolve relative file paths. Defaults to the
            current working directory at call time.
        convert: Whether to convert the data to a standard format.
        aggregate: Whether to aggregate the data. If None, will aggregate if the data is not already aggregated.
        sort_rows: Whether to sort the rows.
        sort_columns: Whether to sort the columns in a standard order.
        drop_null: Whether to drop columns that are entirely null.

    Returns:
        A Narwhals DataFrame containing the loaded peptide data.

    """
    # Resolve the default here rather than in the signature: a `Path.cwd()`
    # default is evaluated once at import time and goes stale if the process
    # later changes its working directory.
    if base_dir is None:
        base_dir = Path.cwd()

    # Resolve the data file path
    if peptides.data_file.is_absolute():
        data_path = peptides.data_file
    else:
        data_path = base_dir / peptides.data_file

    # Load the raw data
    df = load_data(data_path)

    # Imported locally to avoid a circular import at module load time.
    from hdxms_datasets import process

    df = process.apply_filters(df, **peptides.filters)

    format_spec = FORMAT_LUT.get(peptides.data_format)
    assert format_spec is not None, f"Unknown format: {peptides.data_format}"

    # `aggregated` may be a static flag or a predicate taking the dataframe.
    if callable(format_spec.aggregated):
        is_aggregated = format_spec.aggregated(df)
    else:
        is_aggregated = format_spec.aggregated

    # if aggregation is not specified, by default aggregate if the data is not already aggregated
    if aggregate is None:
        aggregate = not is_aggregated

    if aggregate and is_aggregated:
        warnings.warn("Data format is pre-aggregated. Aggregation will be skipped.")
        aggregate = False

    if not convert and aggregate:
        warnings.warn("Cannot aggregate data without conversion. Aggregation will be skipped.")
        aggregate = False

    if not convert and sort_rows:
        warnings.warn("Cannot sort rows without conversion. Sorting will be skipped.")
        sort_rows = False

    if not convert and sort_columns:
        warnings.warn("Cannot sort columns without conversion. Sorting will be skipped.")
        sort_columns = False

    if convert:
        df = format_spec.convert(df)

    if aggregate:
        df = process.aggregate(df)

    if drop_null:
        df = process.drop_null_columns(df)

    if sort_rows:
        df = process.sort_rows(df)

    if sort_columns:
        df = process.sort_columns(df)

    return df

parse_hxms_lines(lines, read_content=True)

Parse the different sections of an HXMS file.

Returns a dictionary with keys:
  • "HEADER": list of header lines
  • "METADATA": dict of metadata key-value pairs
  • "REMARK": dict of remark key-value pairs
  • "DATA": Narwhals DataFrame containing the HXMS data (if read_content is True)

Parameters:

Name Type Description Default
lines Iterable[str]

An iterable of lines from the HXMS file.

required

Returns:

Type Description
HXMSResult

A dictionary containing the parsed information.

Source code in hdxms_datasets/loader.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
def parse_hxms_lines(lines: Iterable[str], read_content: bool = True) -> HXMSResult:
    """Parse the different sections of an HXMS file.

    Returns a dictionary with keys:
        - "HEADER": list of header lines
        - "METADATA": dict of metadata key-value pairs
        - "REMARK": dict of remark key-value pairs
        - "DATA": Narwhals DataFrame containing the HXMS data (if read_content is True)

    Args:
        lines: An iterable of lines from the HXMS file.
        read_content: Whether to parse the data section into a DataFrame. If
            False, parsing stops at the "TITLE_TP" line and only the header
            sections are returned.

    Returns:
        A dictionary containing the parsed information.

    """
    result: HXMSResult = {
        "HEADER": [],
        "METADATA": {},
        "REMARK": {},
    }
    columns = []
    line_iter = iter(lines)
    for line in line_iter:
        if line.startswith("HEADER"):
            # Use removeprefix, not lstrip: lstrip("HEADER") strips a character
            # *set*, so content immediately following the keyword that starts
            # with any of "HEADR" would be eaten as well.
            content = line.removeprefix("HEADER").strip()
            result["HEADER"].append(content)
        elif line.startswith("METADATA"):
            # Only well-formed "METADATA <key> <value>" lines are kept;
            # anything else is silently ignored.
            _, *raw_content = line.strip().split(" ")
            content = [item for item in raw_content if item]
            if len(content) == 2:
                key, value = content
                result["METADATA"][key] = value
        elif line.startswith("REMARK"):
            _, *raw_content = line.strip().split(" ")
            content = [item for item in raw_content if item]
            if len(content) == 2:
                key, value = content
                result["REMARK"][key] = value
        elif line.startswith("TITLE_TP") and not read_content:
            # Header-only parse requested; stop before the data section.
            return result
        elif line.startswith("TITLE_TP") and read_content:
            columns = _line_content(line)
            break

    # the rest of the lines are data lines
    # NOTE(review): raises KeyError if "PROTEIN_SEQUENCE" is absent from the
    # metadata section -- presumably mandatory in valid HXMS files; confirm.
    df = _parse_hxms_TP_lines(line_iter, sequence=result["METADATA"]["PROTEIN_SEQUENCE"])
    result["DATA"] = df

    # check read columns against expected columns
    if columns:
        expected_columns = list(HXMS_DTYPES)[: len(columns)]
        if columns != expected_columns:
            warnings.warn(
                f"Columns in HXMS file do not match expected columns. "
                f"Found: {columns}, Expected: {expected_columns}"
            )

    return result

read_csv(source)

Read a CSV file and return a Narwhals DataFrame.

Parameters:

Name Type Description Default
source Path | IO | bytes

Source object representing the CSV data.

required

Returns:

Type Description
DataFrame

A Narwhals DataFrame containing the CSV data.

Source code in hdxms_datasets/loader.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def read_csv(source: Path | IO | bytes) -> nw.DataFrame:
    """
    Read a CSV file and return a Narwhals DataFrame.

    Args:
        source: Source object representing the CSV data. May be a filesystem
            path, raw CSV bytes, or an open file-like object.

    Returns:
        A Narwhals DataFrame containing the CSV data.

    Raises:
        TypeError: If source is not a Path, bytes, or file-like object.
        ValueError: If no backend is available for reading file-like objects.
    """

    if isinstance(source, Path):
        return nw.read_csv(source.as_posix(), backend=BACKEND)
    elif isinstance(source, bytes):
        import polars as pl

        return nw.from_native(pl.read_csv(source))
    elif hasattr(source, "read"):
        # NOTE: `isinstance(source, IO)` does not work at runtime --
        # typing.IO is not a runtime-checkable protocol, so real file
        # objects never match it and would fall through to the TypeError
        # branch below. Duck-type on the `read` attribute instead.
        try:
            import polars as pl

            return nw.from_native(pl.read_csv(source))
        except ImportError:
            pass
        try:
            import pandas as pd

            return nw.from_native(pd.read_csv(source))  # type: ignore
        except ImportError:
            raise ValueError("No suitable backend found for reading file-like objects")
    else:
        raise TypeError("source must be a Path, bytes, or file-like object")

read_hxms(source)

Read an HXMS file and return its parsed sections as an HXMSResult dictionary.

Parameters:

Name Type Description Default
source Path | IO | bytes

Source object representing the HXMS data.

required

Returns:

Type Description
HXMSResult

An HXMSResult dictionary containing the parsed sections and the HXMS data.

Source code in hdxms_datasets/loader.py
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
def read_hxms(source: Path | IO | bytes) -> HXMSResult:
    """
    Read an HXMS file and parse it into its sections.

    Args:
        source: Source object representing the HXMS data.

    Returns:
        An HXMSResult dictionary with the parsed "HEADER", "METADATA" and
        "REMARK" sections and the "DATA" DataFrame.
    """

    lines = _hxms_splitlines(source)
    # TODO make generator based on input type
    line_gen = iter(lines)

    # first get column names
    result = parse_hxms_lines(line_gen, read_content=True)

    return result