Skip to content

reader

Module for loading various HDX-MS formats.

deduplicate_name(name)

Deduplicate a column name by removing a trailing '_duplicated_xx' suffix.

Source code in hdxms_datasets/reader.py
 96
 97
 98
 99
100
def deduplicate_name(name: str):
    """Strip a trailing ``_duplicated_<digits>`` marker from a column name.

    Names that do not end in such a marker are returned unchanged.
    """
    import re

    suffix_pattern = re.compile(r"_duplicated_\d+$")
    return suffix_pattern.sub("", name)

get_backend()

Returns the backend used for data handling.

Source code in hdxms_datasets/reader.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def get_backend():
    """
    Returns the backend used for data handling.

    The candidate packages are probed in a fixed order of preference
    (polars, pandas, modin, pyarrow) and the name of the first one that can be
    imported is returned.

    Returns:
        The name of the first importable backend package.

    Raises:
        ImportError: If none of the candidate packages can be imported.
    """
    import importlib

    # Order matters: earlier entries win when several packages are installed.
    for candidate in ("polars", "pandas", "modin", "pyarrow"):
        try:
            importlib.import_module(candidate)
        except ImportError:
            continue
        return candidate

    raise ImportError("No suitable backend found. Please install pandas, polars, pyarrow or modin.")

hxms_line_generator(source)

Generate lines from an HXMS file.

Source code in hdxms_datasets/reader.py
225
226
227
228
229
230
231
def hxms_line_generator(source: Path) -> Iterator[str]:
    """
    Lazily yield the lines of an HXMS file.

    The file is opened as UTF-8 text and each line is yielded with its
    trailing carriage-return / newline characters removed.
    """
    with source.open("r", encoding="utf-8") as handle:
        yield from (raw_line.rstrip("\r\n") for raw_line in handle)

parse_hxms_lines(lines, read_content=True)

Parse the different sections of an HXMS file.

Returns a dictionary with keys
  • "HEADER": list of header lines
  • "METADATA": dict of metadata key-value pairs
  • "REMARK": dict of remark key-value pairs
  • "DATA": Narwhals DataFrame containing the HXMS data (if read_content is True)

Parameters:

Name Type Description Default
lines Iterable[str]

An iterable of lines from the HXMS file.

required

Returns:

Type Description
HXMSResult

A dictionary containing the parsed information.

Source code in hdxms_datasets/reader.py
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
def _split_hxms_key_value(line: str) -> tuple[str, str] | None:
    """Return the (key, value) pair of a METADATA/REMARK line, or None unless it has exactly two fields."""
    # The first token is the record tag itself; runs of spaces produce empty items.
    _tag, *raw_fields = line.strip().split(" ")
    fields = [item for item in raw_fields if item]
    if len(fields) != 2:
        return None
    return fields[0], fields[1]


def parse_hxms_lines(lines: Iterable[str], read_content: bool = True) -> HXMSResult:
    """Parse the different sections of an HXMS file.

    Returns a dictionary with keys:
        - "HEADER": list of header lines
        - "METADATA": dict of metadata key-value pairs
        - "REMARK": dict of remark key-value pairs
        - "DATA": Narwhals DataFrame containing the HXMS data (if read_content is True)

    METADATA and REMARK lines are only stored when they consist of exactly one
    key and one value; other lines of these kinds are ignored.

    Args:
        lines: An iterable of lines from the HXMS file.
        read_content: If False, parsing stops at the "TITLE_TP" line (or at the
            end of the input when there is none) and no "DATA" entry is produced.

    Returns:
        A dictionary containing the parsed information.

    Raises:
        KeyError: If read_content is True and no "PROTEIN_SEQUENCE" metadata
            entry was found before the data section.

    """
    result: HXMSResult = {
        "HEADER": [],
        "METADATA": {},
        "REMARK": {},
    }
    columns = []
    line_iter = iter(lines)
    for line in line_iter:
        if line.startswith("HEADER"):
            # Slice the tag off; str.lstrip("HEADER") would strip a character
            # set and could also eat leading letters of the content.
            result["HEADER"].append(line[len("HEADER"):].strip())
        elif line.startswith("METADATA"):
            pair = _split_hxms_key_value(line)
            if pair is not None:
                result["METADATA"][pair[0]] = pair[1]
        elif line.startswith("REMARK"):
            pair = _split_hxms_key_value(line)
            if pair is not None:
                result["REMARK"][pair[0]] = pair[1]
        elif line.startswith("TITLE_TP"):
            # The title line marks the start of the data section.
            if read_content:
                columns = _line_content(line)
            break

    # Header-only parsing must never touch the data section, even when the
    # input had no TITLE_TP line.
    if not read_content:
        return result

    # the rest of the lines are data lines
    df = _parse_hxms_TP_lines(line_iter, sequence=result["METADATA"]["PROTEIN_SEQUENCE"])
    result["DATA"] = df

    # check read columns against expected columns
    if columns:
        expected_columns = list(HXMS_SCHEMA)[: len(columns)]
        if columns != expected_columns:
            warnings.warn(
                f"Columns in HXMS file do not match expected columns. "
                f"Found: {columns}, Expected: {expected_columns}"
            )

    return result

read_csv(source, **kwargs)

Read a CSV file and return a Narwhals DataFrame.

Parameters:

Name Type Description Default
source Path | StringIO | bytes

Source object representing the CSV data.

required

Returns:

Type Description
DataFrame

A Narwhals DataFrame containing the CSV data.

Source code in hdxms_datasets/reader.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
def _read_native_csv(buffer):
    """Read in-memory CSV data with polars, or with pandas when polars is missing (ImportError if neither)."""
    try:
        import polars as pl
    except ImportError:
        pl = None

    if pl is not None:
        return pl.read_csv(buffer)

    import pandas as pd

    if isinstance(buffer, bytes):
        # pandas would treat raw bytes as a path; wrap them in a binary buffer.
        from io import BytesIO

        buffer = BytesIO(buffer)
    return pd.read_csv(buffer)


def read_csv(source: Path | StringIO | bytes, **kwargs) -> nw.DataFrame:
    """
    Read a CSV file and return a Narwhals DataFrame.

    Paths are read through ``nw.read_csv`` with the module-level BACKEND.
    In-memory sources (bytes, StringIO) are read with polars when it is
    installed and with pandas otherwise.

    Args:
        source: Source object representing the CSV data.
        **kwargs: Forwarded to ``nw.read_csv`` for Path sources and to
            ``nw.from_native`` for in-memory sources.

    Returns:
        A Narwhals DataFrame containing the CSV data.

    Raises:
        ImportError: If source is bytes and neither polars nor pandas is installed.
        ValueError: If source is a StringIO and neither polars nor pandas is installed.
        TypeError: If source is not a Path, bytes or StringIO.

    """
    # NOTE(review): kwargs reach the CSV reader for Path sources but
    # nw.from_native for in-memory sources -- confirm this asymmetry is intended.
    if isinstance(source, Path):
        return nw.read_csv(source.as_posix(), backend=BACKEND, **kwargs)
    elif isinstance(source, bytes):
        return nw.from_native(_read_native_csv(source), **kwargs)
    elif isinstance(source, StringIO):
        try:
            native = _read_native_csv(source)
        except ImportError as err:
            raise ValueError("No suitable backend found for reading file-like objects") from err
        return nw.from_native(native, **kwargs)  # type: ignore
    else:
        raise TypeError(f"Source must be a Path, bytes, or file-like object, got: {type(source)}")

read_hdexaminer_peptide_pool(source)

Read an HDX-Examiner peptide pool file and return a Narwhals DataFrame.

Parameters:

Name Type Description Default
source Path | StringIO

Source object representing the HDX-Examiner peptide pool data.

required
Source code in hdxms_datasets/reader.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def read_hdexaminer_peptide_pool(source: Path | StringIO) -> nw.DataFrame:
    """
    Read an HDX-Examiner peptide pool file and return a Narwhals DataFrame.

    The first line of the file is read as the exposure row and the second line
    as the column header. Every non-empty entry in the exposure row starts a
    block of columns that runs until the next non-empty entry. The first eight
    columns are repeated once per block and the per-block columns are stacked
    vertically, together with an "Exposure" column holding the block's label.

    Args:
        source: Source object representing the HDX-Examiner peptide pool data.

    Returns:
        A Narwhals DataFrame with the first eight columns, the columns of
        HDEXAMINER_PEPTIDE_POOL_REPEATED_SCHEMA and an "Exposure" column.

    Raises:
        ValueError: If BACKEND is not supported for reading from a path, or if
            the first eight columns do not match
            HDEXAMINER_PEPTIDE_POOL_INITIAL_SCHEMA.

    """

    # read the data and header
    if isinstance(source, StringIO):
        try:
            import polars as pl

            df = nw.from_native(pl.read_csv(source, skip_rows=1, has_header=True))
        except ImportError:
            import pandas as pd

            df = nw.from_native(pd.read_csv(source, skiprows=[0]))

        # rewind: the CSV reader consumed the buffer
        source.seek(0)
        exposure_line = source.readline()
        header_line = source.readline()

    else:
        kwargs = {
            "polars": {"skip_rows": 1, "has_header": True},
            "pandas": {"skiprows": [0]},
        }
        if BACKEND not in kwargs:
            raise ValueError(f"Unsupported backend: {BACKEND}")
        df = nw.read_csv(source.as_posix(), backend=BACKEND, **kwargs[BACKEND])
        with open(source, "r") as fh:
            exposure_line = fh.readline()
            header_line = fh.readline()

    exposure_columns = exposure_line.strip().split(",")
    header_columns = header_line.strip().split(",")

    found_schema = df[:, 0:8].schema
    if found_schema != HDEXAMINER_PEPTIDE_POOL_INITIAL_SCHEMA:
        raise ValueError("HDX-Examiner peptide pool file has an unexpected columns schema.")

    # find indices of exposure markers in header; the trailing sentinel closes the last block
    has_entry_with_end = [i for i, col in enumerate(exposure_columns) if col] + [
        len(exposure_columns)
    ]

    num_blocks = len(has_entry_with_end) - 1
    initial_df = nw.concat([df[:, :8]] * num_blocks, how="vertical")

    output = defaultdict(list)
    for i, j in zip(has_entry_with_end[:-1], has_entry_with_end[1:]):
        exposure = exposure_columns[i]
        found_columns = header_columns[i:j]

        # iterate over the expected columns, extract the series and append to output
        # for missing columns, create a series of nulls
        for col, dtype in HDEXAMINER_PEPTIDE_POOL_REPEATED_SCHEMA.items():
            if col in found_columns:
                column_index = found_columns.index(col) + i
                frame = df[:, column_index].cast(dtype).alias(col).to_frame()
            else:
                c = itertools.repeat(None, len(df))
                frame = nw.Series.from_iterable(
                    name=col, values=c, dtype=dtype, backend=BACKEND
                ).to_frame()

            output[col].append(frame)

        # Exposure labels are plain strings taken from the first line, so the
        # dtype is given explicitly instead of reusing whatever dtype the
        # schema loop above happened to end on.
        c = itertools.repeat(exposure, len(df))
        frame = nw.Series.from_iterable(
            name="Exposure", values=c, dtype=nw.String(), backend=BACKEND
        ).to_frame()
        output["Exposure"].append(frame)

    # combine all 1-column frames first vertically and then horizontally with initial_df
    concatenated = {k: nw.concat(v, how="vertical") for k, v in output.items()}
    final_output = nw.concat([initial_df, *concatenated.values()], how="horizontal")

    return final_output

read_hxms(source, returns='HXMSResult')

read_hxms(
    source: Path | IO | bytes,
    returns: Literal["HXMSResult"],
) -> HXMSResult
read_hxms(
    source: Path | IO | bytes, returns: Literal["DataFrame"]
) -> nw.DataFrame

Read an HXMS file and return a HXMSResult or Narwhals DataFrame.

Parameters:

Name Type Description Default
source Path | IO | bytes

Source object representing the HXMS data.

required

Returns:

Type Description
HXMSResult | DataFrame

The parsed HXMSResult, or only its Narwhals DataFrame with the HXMS data when returns is 'DataFrame'.

Source code in hdxms_datasets/reader.py
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
def read_hxms(
    source: Path | IO | bytes,
    returns: Literal["HXMSResult", "DataFrame"] = "HXMSResult",
) -> HXMSResult | nw.DataFrame:
    """
    Read an HXMS file and return a HXMSResult or Narwhals DataFrame.

    Args:
        source: Source object representing the HXMS data.
        returns: "HXMSResult" to get the full parse result, "DataFrame" to get
            only its "DATA" entry.

    Returns:
        The parsed HXMSResult, or the Narwhals DataFrame containing the HXMS
        data when returns is "DataFrame".

    Raises:
        ValueError: If returns is not one of the supported values.
    """

    # Reject an unsupported `returns` value before reading and parsing the source.
    if returns not in ("HXMSResult", "DataFrame"):
        raise ValueError(f"Unsupported returns value: {returns!r}")

    lines = _hxms_splitlines(source)
    # TODO make generator based on input type
    line_gen = iter(lines)

    # parse header sections and the data section in one pass
    result = parse_hxms_lines(line_gen, read_content=True)

    if returns == "HXMSResult":
        return result

    # Internal invariant: parsing with read_content=True sets "DATA".
    assert "DATA" in result, "No data found in HXMS file"
    return result["DATA"]