
formats

FormatSpec

Bases: ABC

Specification for an HDX data format

Parameters:

    name: Name of the format. (required)
    returned_columns: List of columns returned by .read(). May return additional columns depending on the format. (required)
    filter_columns: List of columns that can be used to filter data. (required)
    is_valid_file: Function to check if a file is valid for this format. (required)
    reader: Function to read a file into a DataFrame. (required)
    converter: Function to convert a DataFrame to OpenHDX format. (required)
    aggregated: Whether the format is aggregated, not aggregated, or None if it depends on the data. (required)
    doc: Optional documentation string. (required)
Source code in hdxms_datasets/formats.py
class FormatSpec(ABC):
    """Specification for a HDX data format

    Args:
        name: Name of the format.
        returned_columns: List of columns returned by .read(). May return
            additional columns depending on the format.
        filter_columns: List of columns that can be used to filter data.
        is_valid_file: Function to check if a file is valid for this format.
        reader: Function to read a file into a DataFrame.
        converter: Function to convert a DataFrame to OpenHDX format.
        aggregated: Whether the format is aggregated, not aggregated, or None if
            it depends on the data.
        doc: Optional documentation string.

    """

    doc: str = ""
    returned_columns: list[str]
    filter_columns: list[str] = []
    aggregated: bool | None = None

    def __init_subclass__(cls):
        """Register format in global registry."""

        required_class_attrs = ["returned_columns"]
        for attr in required_class_attrs:
            if not hasattr(cls, attr):
                raise NotImplementedError(
                    f"Class attribute '{attr}' must be defined in subclass '{cls.__name__}'"
                )

        if cls.__name__ in FMT_REGISTRY:
            warnings.warn((f"Format '{cls.__name__}' is already registered. Overwriting."))
        FMT_REGISTRY[cls.__name__] = cls

    @classmethod
    @abstractmethod
    def read(cls, path: Path) -> nw.DataFrame:
        """Read the data to a dataframe using the format's reader."""

    @classmethod
    @abstractmethod
    def convert(cls, df: nw.DataFrame) -> nw.DataFrame:
        """Convert DataFrame to OpenHDX format."""

    @classmethod
    @abstractmethod
    def valid_file(cls, path: Path) -> bool:
        """Default format identification based on file extension."""

    @classmethod
    @abstractmethod
    def load(cls, path: Path, convert=True) -> nw.DataFrame:
        """Load and convert a file to OpenHDX format."""
        df = cls.read(path)
        if convert:
            df = cls.convert(df)
        return df
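
Example: a minimal sketch of adding a new format, assuming a plain CSV whose columns already match the OpenHDX layout. The class name, the "uptake" column, and the use of polars for reading are illustrative assumptions, not part of the library; subclassing FormatSpec is enough to register the class in FMT_REGISTRY via __init_subclass__.

import narwhals as nw
import polars as pl
from pathlib import Path


class OpenHDX_csv_example(FormatSpec):
    """Hypothetical format: a CSV that is already in the OpenHDX column layout."""

    returned_columns = ["protein", "state", "start", "end", "exposure", "uptake"]
    filter_columns = ["protein", "state", "exposure"]
    aggregated = True

    @classmethod
    def read(cls, path: Path) -> nw.DataFrame:
        # read with polars, then wrap as a narwhals DataFrame
        return nw.from_native(pl.read_csv(path), eager_only=True)

    @classmethod
    def convert(cls, df: nw.DataFrame) -> nw.DataFrame:
        # columns are assumed to already be in OpenHDX format, so no mapping is needed
        return df

    @classmethod
    def valid_file(cls, path: Path) -> bool:
        # naive check on the file extension only; a real format should inspect the columns
        return path.suffix.lower() == ".csv"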

__init_subclass__()

Register format in global registry.

Source code in hdxms_datasets/formats.py
def __init_subclass__(cls):
    """Register format in global registry."""

    required_class_attrs = ["returned_columns"]
    for attr in required_class_attrs:
        if not hasattr(cls, attr):
            raise NotImplementedError(
                f"Class attribute '{attr}' must be defined in subclass '{cls.__name__}'"
            )

    if cls.__name__ in FMT_REGISTRY:
        warnings.warn((f"Format '{cls.__name__}' is already registered. Overwriting."))
    FMT_REGISTRY[cls.__name__] = cls
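
A hedged illustration (the subclass name below is made up): omitting the required returned_columns attribute fails at class-definition time, while every valid subclass ends up in FMT_REGISTRY keyed by its class name.

try:
    class BrokenFormat(FormatSpec):  # hypothetical subclass that forgets returned_columns
        pass
except NotImplementedError as err:
    print(err)  # Class attribute 'returned_columns' must be defined in subclass 'BrokenFormat'

print("HDExaminer_peptide_pool" in FMT_REGISTRY)  # True: concrete formats register themselves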

convert(df) abstractmethod classmethod

Convert DataFrame to OpenHDX format.

Source code in hdxms_datasets/formats.py
@classmethod
@abstractmethod
def convert(cls, df: nw.DataFrame) -> nw.DataFrame:
    """Convert DataFrame to OpenHDX format."""

load(path, convert=True) abstractmethod classmethod

Load and convert a file to OpenHDX format.

Source code in hdxms_datasets/formats.py
@classmethod
@abstractmethod
def load(cls, path: Path, convert=True) -> nw.DataFrame:
    """Load and convert a file to OpenHDX format."""
    df = cls.read(path)
    if convert:
        df = cls.convert(df)
    return df
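
A minimal usage sketch (the file path is hypothetical): load() chains read() and convert(), and convert=False returns the table with the format's original column names.

from pathlib import Path

path = Path("uptake_summary.csv")  # hypothetical HD Examiner export
raw = HDExaminer_uptake_summary.load(path, convert=False)  # original columns, e.g. "Deut Time (sec)"
openhdx = HDExaminer_uptake_summary.load(path)             # converted to OpenHDX format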

read(path) abstractmethod classmethod

Read the data to a dataframe using the format's reader.

Source code in hdxms_datasets/formats.py
@classmethod
@abstractmethod
def read(cls, path: Path) -> nw.DataFrame:
    """Read the data to a dataframe using the format's reader."""

valid_file(path) abstractmethod classmethod

Default format identification based on file extension.

Source code in hdxms_datasets/formats.py
@classmethod
@abstractmethod
def valid_file(cls, path: Path) -> bool:
    """Default format identification based on file extension."""

HDExaminer_all_results_with_units

Bases: HDExaminer_all_results

There are some 'all results' files out there which have a variation on the standard columns
where the units are included in the column names:
- Peak Width > Peak Width Da
- m/z Shift > m/z Shift Da

Source code in hdxms_datasets/formats.py
class HDExaminer_all_results_with_units(HDExaminer_all_results):
    """
    There are some 'all results' files out there which have a variation on the standard columns
    where the units are included in the column names:
    - Peak Width > Peak Width Da
    - m/z Shift > m/z Shift Da
    """

    returned_columns = [
        "Protein State",
        "Deut Time",
        "Experiment",
        "Start",
        "End",
        "Sequence",
        "Charge",
        "Search RT",
        "Actual RT",
        "# Spectra",
        "Peak Width Da",
        "m/z Shift Da",
        "Max Inty",
        "Exp Cent",
        "Theor Cent",
        "Score",
        "Cent Diff",
        "# Deut",
        "Deut %",
        "Confidence",
    ]

HDExaminer_peptide_pool

Bases: FormatSpec

HDExaminer Peptide Pool format specification

This file consists of an initial block of 8 columns (first 8 in returned_columns),
followed by repeating blocks of typically 8 columns per exposure (the last 8 in returned_columns).
The repeating column blocks might have 6 columns for FD control blocks. These columns are:

['Start RT', 'End RT', '#D', '%Max D', 'Score', 'Conf']

The first line in this file is a header with exposure times, in seconds formatted as '10s', or 'Full-D'
for the fully deuterated control.

Reading the file returns an additional "Exposure" column with values derived from the header line.

Source code in hdxms_datasets/formats.py
class HDExaminer_peptide_pool(FormatSpec):
    """HDExaminer Peptide Pool format specification

    This file consists of an initial block of 8 columns (first 8 in returned_columns),
    followed by repeating blocks of typically 8 columns per exposure (the last 8 in returned_columns).
    The repeating column blocks might have 6 columns for FD control blocks. These columns are:

    >> ['Start RT', 'End RT', '#D', '%Max D', 'Score', 'Conf']

    The first line in this file is a header with exposure times, in seconds formatted as '10s', or 'Full-D'
    for the fully deuterated control.

    Reading the file returns an additional "Exposure" column with values derived from the header line.

    """

    returned_columns = [
        "State",
        "Protein",
        "Start",
        "End",
        "Sequence",
        "Search RT",
        "Charge",
        "Max D",
        "Start RT",
        "End RT",
        "#D",
        "%D",
        "#D right",
        "%D right",
        "Score",
        "Conf",
        "Exposure",
    ]

    filter_columns = ["Protein", "State", "Exposure"]
    aggregated = False

    @classmethod
    def read(cls, path: Path) -> nw.DataFrame:
        return read_hdexaminer_peptide_pool(path)

    @classmethod
    def convert(cls, df: nw.DataFrame) -> nw.DataFrame:
        return from_hdexaminer_peptide_pool(df)

    @classmethod
    def valid_file(cls, path: Path) -> bool:
        columns = read_columns(path, line=1)
        return set(cls.returned_columns[:-1]).issubset(set(columns))
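
A short usage sketch (the path is hypothetical). valid_file() checks that the file-specific columns (all of returned_columns except the derived "Exposure") are present, read() returns the table with the added "Exposure" column, and convert() maps it to OpenHDX format.

from pathlib import Path

path = Path("peptide_pool.csv")  # hypothetical HD Examiner peptide pool export
if HDExaminer_peptide_pool.valid_file(path):
    raw = HDExaminer_peptide_pool.read(path)   # includes the "Exposure" column derived from the header line
    df = HDExaminer_peptide_pool.convert(raw)  # mapped to OpenHDX columns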

HDExaminer_uptake_summary

Bases: FormatSpec

HDExaminer Uptake Summary Table format specification.

This is the output of HD Examiner's "Uptake Summary Table" report.

The resulting table is aggregated, i.e. only one row per peptide/timepoint.

The fully deuterated control is labelled as 'MAX'; exposure times are in seconds,
e.g. '10', '100', etc.

Typically includes a nondeuterated control as '0'.

Source code in hdxms_datasets/formats.py
class HDExaminer_uptake_summary(FormatSpec):
    """HDExaminer Uptake Summary Table format specification.

    This is the output of HD Examiner's "Uptake Summary Table" report.

    The resulting table is aggregated, i.e. only one row per peptide/timepoint.

    The fully deuterated control is labelled as 'MAX'; exposure times are in seconds,
    e.g. '10', '100', etc.

    Typically includes a nondeuterated control as '0'.

    """

    returned_columns = [
        "Protein State",
        "Protein",
        "Start",
        "End",
        "Sequence",
        "Peptide Mass",
        "RT (min)",
        "Deut Time (sec)",
        "maxD",
        "Theor Uptake #D",
        "#D",
        "%D",
        "Conf Interval (#D)",
        "#Rep",
        "Confidence",
        "Stddev",
        "p",
    ]
    filter_columns = ["Protein", "Protein State", "Deut Time (sec)"]
    aggregated = True

    @classmethod
    def read(cls, path: Path) -> nw.DataFrame:
        return read_csv(path)

    @classmethod
    def convert(cls, df: nw.DataFrame) -> nw.DataFrame:
        return from_hdexaminer_uptake_summary(df)

    @classmethod
    def valid_file(cls, path: Path) -> bool:
        columns = read_columns(path)
        return set(cls.returned_columns).issubset(set(columns))
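
A small sketch (the path and the state name are hypothetical) that filters on one of the format's filter_columns before converting:

import narwhals as nw
from pathlib import Path

path = Path("uptake_summary.csv")
if HDExaminer_uptake_summary.valid_file(path):
    raw = HDExaminer_uptake_summary.read(path)
    apo = raw.filter(nw.col("Protein State") == "apo")  # "apo" is an example state name
    df = HDExaminer_uptake_summary.convert(apo)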

identify_format(path)

Identify format from file path by reading a sample of the data.

Source code in hdxms_datasets/formats.py
def identify_format(path: Path) -> type[FormatSpec]:
    """Identify format from file path by reading a sample of the data."""

    for fmt in FMT_REGISTRY.values():
        if fmt.valid_file(path):
            return fmt
    raise ValueError(f"Could not identify format for file: {path}")
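
A short sketch (the path is hypothetical): registered formats are tried in registration order, and the first one whose valid_file() accepts the file is returned.

from pathlib import Path

path = Path("results_export.csv")  # hypothetical export of unknown provenance
fmt = identify_format(path)        # e.g. HDExaminer_uptake_summary
df = fmt.load(path)                # read and convert to OpenHDX format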

is_aggregated(df)

Checks if an open-hdx formatted DataFrame is aggregated.

A DataFrame is considered aggregated if it contains only one replicate or
replicates are already averaged, i.e. if there is only one entry per
unique combination of protein, state, start, end, and exposure.

Source code in hdxms_datasets/formats.py
def is_aggregated(df: nw.DataFrame) -> bool:
    """Checks if a open-hdx formatted DataFrame is aggregated.

    A DataFrame is considered aggregated if it containns only one replicate or
    replicates are already averaged, ie if there is only one entry per
    unique combination of protein, state, start, end, and exposure.

    """
    identifier_columns = ["protein", "state", "start", "end", "exposure"]
    by = set(identifier_columns) & set(df.columns)
    unique = df.unique(subset=list(by))
    return len(unique) == len(df)
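
A minimal sketch with made-up data (the "uptake" column is illustrative): two rows sharing the same protein, state, start, end, and exposure count as replicates, so the frame is not aggregated.

import narwhals as nw
import polars as pl

native = pl.DataFrame(
    {
        "protein": ["P1", "P1"],
        "state": ["apo", "apo"],
        "start": [1, 1],
        "end": [10, 10],
        "exposure": [10.0, 10.0],
        "uptake": [1.2, 1.3],  # two replicates of the same peptide/timepoint
    }
)
df = nw.from_native(native, eager_only=True)
print(is_aggregated(df))  # False: duplicate identifier rows remain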