Skip to content

convert

convert_rt(rt_str)

Convert an HDExaminer retention time string to a float. Example: "7.44-7.65" -> 7.545

Lossy conversion

This conversion loses information: the full range is not preserved. This was done so that the retention time can be stored as a float and thus be aggregated. Future versions may store the full range with additional rt_min and rt_max columns.

Source code in hdxms_datasets/convert.py
67
68
69
70
71
72
73
74
75
76
77
78
79
def convert_rt(rt_str: str) -> float:
    """Convert HDExaminer retention time string to float.

    A range such as `"7.44-7.65"` is reduced to its midpoint (`7.545`). A string
    holding a single value (no `-` separator) is converted directly.

    !!! warning "Lossy conversion"
        This conversion loses information. The full range is not preserved. This was done so that
        retention time can be stored as float and thus be aggregated.
        Future versions may store the full range with additional `rt_min` and `rt_max` columns.

    Args:
        rt_str: Retention time, either as a `"min-max"` range or as a single number.

    Returns:
        The midpoint of the range, or the value itself when no range is given.

    Raises:
        ValueError: If `rt_str` has more than two `-`-separated parts or a part is not numeric.

    """
    parts = rt_str.split("-")

    # A lone value has no range to average; previously this failed on tuple unpacking.
    if len(parts) == 1:
        return float(parts[0])

    if len(parts) != 2:
        raise ValueError(f"Expected retention time formatted as 'min-max', got {rt_str!r}")

    vmin, vmax = parts
    return (float(vmin) + float(vmax)) / 2.0

from_dynamx_cluster(dynamx_df)

Convert a DynamX cluster DataFrame to OpenHDX format.

Source code in hdxms_datasets/convert.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def from_dynamx_cluster(dynamx_df: nw.DataFrame) -> nw.DataFrame:
    """
    Convert a DynamX cluster DataFrame to OpenHDX format.

    The DynamX column names are renamed, the `centroid_mass` expression (defined
    elsewhere in this module) is added, `exposure` is multiplied by 60
    (presumably minutes to seconds - confirm against the DynamX export format),
    and the result is restricted to the output columns and sorted.

    Args:
        dynamx_df: DataFrame in DynamX cluster format.

    Returns:
        A DataFrame in OpenHDX format.

    """
    renames = {
        "State": "state",
        "Exposure": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "File": "replicate",
        "z": "charge",
        "Center": "centroid_mz",
        "Inten": "intensity",
        "RT": "rt",
    }

    # Output layout: the renamed columns, with "centroid_mass" placed right after "charge".
    output_columns = []
    for new_name in renames.values():
        output_columns.append(new_name)
        if new_name == "charge":
            output_columns.append("centroid_mass")

    renamed = dynamx_df.rename(renames)
    scaled_exposure = nw.col("exposure") * 60.0
    with_mass = renamed.with_columns([centroid_mass, scaled_exposure])
    ordered = with_mass.select(output_columns)

    return ordered.sort(by=["state", "exposure", "start", "end", "replicate"])

from_dynamx_state(dynamx_df)

Convert a DynamX state DataFrame to OpenHDX format.

Source code in hdxms_datasets/convert.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def from_dynamx_state(dynamx_df: nw.DataFrame) -> nw.DataFrame:
    """
    Convert a DynamX state DataFrame to OpenHDX format.

    The DynamX column names are renamed, `exposure` is multiplied by 60
    (presumably minutes to seconds - confirm against the DynamX export format),
    and the result is restricted to the renamed columns and sorted.

    Args:
        dynamx_df: DataFrame in DynamX state format.

    Returns:
        A DataFrame in OpenHDX format.

    """
    renames = {
        "State": "state",
        "Exposure": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "Uptake": "uptake",
        "Uptake SD": "uptake_sd",
        "Center": "centroid_mz",
        "RT": "rt",
        "RT SD": "rt_sd",
    }

    # Only the renamed columns are kept, in mapping order.
    output_columns = [*renames.values()]

    renamed = dynamx_df.rename(renames)
    scaled_exposure = nw.col("exposure") * 60.0
    rescaled = renamed.with_columns([scaled_exposure])
    ordered = rescaled.select(output_columns)

    return ordered.sort(by=["state", "exposure", "start", "end"])

from_hdexaminer(hd_examiner_df, extra_columns=None)

Convert an HDExaminer DataFrame to OpenHDX format.

Parameters:

Name Type Description Default
hd_examiner_df DataFrame

DataFrame in HDExaminer format.

required
extra_columns list[str] | dict[str, str] | str | None

Additional columns to include, either as a list/str of column name(s) or a dictionary mapping original column names to new names.

None

Returns:

Type Description
DataFrame

A DataFrame in OpenHDX format.

Source code in hdxms_datasets/convert.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def from_hdexaminer(
    hd_examiner_df: nw.DataFrame,
    extra_columns: list[str] | dict[str, str] | str | None = None,
) -> nw.DataFrame:
    """
    Convert an HDExaminer DataFrame to OpenHDX format.

    Args:
        hd_examiner_df: DataFrame in HDExaminer format.
        extra_columns: Additional columns to include, either as a list/str of column name(s)
                       or a dictionary mapping original column names to new names.

    Returns:
        A DataFrame in OpenHDX format.

    Raises:
        ValueError: If `extra_columns` is not a str, list, dict or None.

    """
    from hdxms_datasets.loader import BACKEND

    column_mapping = {
        "Protein State": "state",
        "Deut Time": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "Experiment": "replicate",
        "Charge": "charge",
        "Exp Cent": "centroid_mz",
        "Max Inty": "intensity",
    }

    # Output layout: renamed columns with "centroid_mass" right after "charge", then "rt".
    column_order = list(column_mapping.values())
    column_order.insert(column_order.index("charge") + 1, "centroid_mass")
    column_order.append("rt")

    # Normalise every accepted form of `extra_columns` to an {original: new} mapping.
    if isinstance(extra_columns, dict):
        cols = extra_columns
    elif isinstance(extra_columns, list):
        cols = {col: col for col in extra_columns}
    elif isinstance(extra_columns, str):
        cols = {extra_columns: extra_columns}
    elif extra_columns is None:
        cols = {}
    else:
        raise ValueError(
            f"extra_columns must be a str, list, dict or None, not {type(extra_columns)}"
        )

    column_mapping.update(cols)
    column_order.extend(cols.values())

    # "Actual RT" holds range strings; each is collapsed to a single float by convert_rt.
    rt_values = [convert_rt(rt_str) for rt_str in hd_examiner_df["Actual RT"]]
    rt_series = nw.new_series(values=rt_values, name="rt", backend=BACKEND)

    df = (
        hd_examiner_df.rename(column_mapping)
        .with_columns([centroid_mass, rt_series])
        .select(column_order)
        .sort(by=["state", "exposure", "start", "end", "replicate"])
    )

    return cast_exposure(df)