Skip to content

convert

cast_exposure(df)

Tries to cast the exposure column to float

Source code in hdxms_datasets/convert.py
83
84
85
86
87
88
89
def cast_exposure(df: nw.DataFrame) -> nw.DataFrame:
    """Best-effort conversion of the ``exposure`` column to ``Float64``.

    The ``"s"`` character is stripped from the values via ``str.strip_chars("s")``
    before the cast. When the conversion raises one of the handled errors,
    the input frame is handed back untouched.

    Args:
        df: Frame containing an ``exposure`` column.

    Returns:
        The frame with ``exposure`` cast to float, or the original frame if
        the cast could not be applied.
    """
    try:
        as_float = nw.col("exposure").str.strip_chars("s").cast(nw.Float64)
        converted = df.with_columns(as_float)
    except (InvalidOperationError, ValueError, AttributeError):
        # Deliberately tolerant: a failed cast leaves the data as it was.
        return df
    return converted

convert_rt(rt_str)

Convert HDExaminer retention time string to float
example: "7.44-7.65" -> 7.545

Lossy conversion

This conversion loses information: only the midpoint is kept and the full range is not preserved. This is done so that
retention time can be stored as a float and thus be aggregated.
Future versions may store the full range with additional rt_min and rt_max columns.

Source code in hdxms_datasets/convert.py
68
69
70
71
72
73
74
75
76
77
78
79
80
def convert_rt(rt_str: str) -> float:
    """Convert HDExaminer retention time string to float
    example: "7.44-7.65" -> 7.545

    A string without a "-" separator (a single retention time, e.g. "7.5")
    is converted to that value directly instead of failing on unpacking.

    !!! warning "Lossy conversion"
        This conversion loses information. The full range is not preserved. This was done such that
        retention time can be stored as float and thus be aggregated.
        Future versions may store the full range with additional `rt_min` and `rt_max` columns.

    Args:
        rt_str: Retention time, either a "min-max" range or a single number.

    Returns:
        The midpoint of the range, or the value itself for a single number.

    Raises:
        ValueError: If any part of the string cannot be parsed as a float.

    """
    # partition() always yields three parts, so a missing separator can be
    # detected explicitly rather than surfacing as an unpacking error.
    vmin, sep, vmax = rt_str.partition("-")
    if not sep:
        return float(vmin)
    return (float(vmin) + float(vmax)) / 2.0

from_dynamx_cluster(dynamx_df)

Convert a DynamX cluster DataFrame to OpenHDX format.

Source code in hdxms_datasets/convert.py
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def from_dynamx_cluster(dynamx_df: nw.DataFrame) -> nw.DataFrame:
    """
    Translate a DynamX cluster table into the OpenHDX column layout.

    Columns are renamed, the module-level ``centroid_mass`` expression is
    added, ``exposure`` is multiplied by 60, and the rows are sorted by
    state, exposure, start, end and replicate.

    Args:
        dynamx_df: DataFrame with DynamX cluster column names.

    Returns:
        A DataFrame in OpenHDX format.
    """
    renames = {
        "State": "state",
        "Exposure": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "File": "replicate",
        "z": "charge",
        "Center": "centroid_mz",
        "Inten": "intensity",
        "RT": "rt",
    }

    # Output layout: renamed columns in mapping order, with "centroid_mass"
    # slotted in directly after "charge".
    ordered = []
    for new_name in renames.values():
        ordered.append(new_name)
        if new_name == "charge":
            ordered.append("centroid_mass")

    renamed = dynamx_df.rename(renames)
    # The factor 60 presumably converts DynamX minutes to seconds -- TODO confirm.
    extended = renamed.with_columns([centroid_mass, nw.col("exposure") * 60.0])
    selected = extended.select(ordered)

    return selected.sort(by=["state", "exposure", "start", "end", "replicate"])

from_dynamx_state(dynamx_df)

Convert a DynamX state DataFrame to OpenHDX format.

Source code in hdxms_datasets/convert.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def from_dynamx_state(dynamx_df: nw.DataFrame) -> nw.DataFrame:
    """
    Translate a DynamX state table into the OpenHDX column layout.

    Columns are renamed, ``exposure`` is multiplied by 60, only the mapped
    columns are kept (in mapping order), and the rows are sorted by state,
    exposure, start and end.

    Args:
        dynamx_df: DataFrame with DynamX state column names.

    Returns:
        A DataFrame in OpenHDX format.
    """
    renames = {
        # TODO add Protein
        "State": "state",
        "Exposure": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "Uptake": "uptake",
        "Uptake SD": "uptake_sd",
        "Center": "centroid_mz",
        "RT": "rt",
        "RT SD": "rt_sd",
    }
    ordered = [*renames.values()]

    renamed = dynamx_df.rename(renames)
    # The factor 60 presumably converts DynamX minutes to seconds -- TODO confirm.
    scaled = renamed.with_columns([nw.col("exposure") * 60.0])
    selected = scaled.select(ordered)

    return selected.sort(by=["state", "exposure", "start", "end"])

from_hdexaminer_all_results(hd_examiner_df, extra_columns=None)

Convert an HDExaminer 'All results' exported DataFrame to OpenHDX format.

To export as all results (from HDExaminer documentation):

To export all tables to a .csv file, switch to the Analysis View, then select any experiment.
Select “Tools”, then “Export”, then “All Results Tables…” or right-click on the results table
and select “Export All Tables…”. Specify a filename. HDExaminer will save the combined tables
to that file.

Parameters:

Name Type Description Default
hd_examiner_df DataFrame

DataFrame in HDExaminer format.

required
extra_columns list[str] | dict[str, str] | str | None

Additional columns to include, either as a list/str of column name(s)
or a dictionary mapping original column names to new names.

None

Returns:

Type Description
DataFrame

A DataFrame in OpenHDX format.

Source code in hdxms_datasets/convert.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def from_hdexaminer_all_results(
    hd_examiner_df: nw.DataFrame,
    extra_columns: list[str] | dict[str, str] | str | None = None,
) -> nw.DataFrame:
    """
    Translate an HDExaminer 'All results' export into the OpenHDX column layout.

    How to produce the export (from HDExaminer documentation):

    To export all tables to a .csv file, switch to the Analysis View, then select any experiment.
    Select “Tools”, then “Export”, then “All Results Tables…” or right-click on the results table
    and select “Export All Tables…”. Specify a filename. HDExaminer will save the combined tables
    to that file.

    The "Actual RT" column is reduced to a single ``rt`` value per row via
    ``convert_rt`` and the result is passed through ``cast_exposure``.

    Args:
        hd_examiner_df: DataFrame in HDExaminer format.
        extra_columns: Additional columns to include, either as a list/str of column name(s)
                       or a dictionary mapping original column names to new names.

    Returns:
        A DataFrame in OpenHDX format.

    """
    from hdxms_datasets.reader import BACKEND

    base_renames = {
        "Protein State": "state",
        "Deut Time": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "Experiment": "replicate",
        "Charge": "charge",
        "Exp Cent": "centroid_mz",
        "Max Inty": "intensity",
    }
    extras = _fmt_extra_columns(extra_columns)

    # Output layout: base columns with "centroid_mass" right after "charge",
    # then "rt", then any caller-requested extra columns.
    base_names = list(base_renames.values())
    split_at = base_names.index("charge") + 1
    ordered = [
        *base_names[:split_at],
        "centroid_mass",
        *base_names[split_at:],
        "rt",
        *extras.values(),
    ]
    renames = {**base_renames, **extras}

    # TODO: parse to two columns, start_rt, end_rt
    midpoints = list(map(convert_rt, hd_examiner_df["Actual RT"]))
    rt_series = nw.new_series(values=midpoints, name="rt", backend=BACKEND)

    renamed = hd_examiner_df.rename(renames)
    extended = renamed.with_columns([centroid_mass, rt_series])
    selected = extended.select(ordered)
    # TODO sort by protein first (if available), take from global var
    result = selected.sort(by=["state", "exposure", "start", "end", "replicate"])

    return cast_exposure(result)

from_hdexaminer_peptide_pool(df)

Convert from HDExaminer peptide pool format to OpenHDX format.

Source code in hdxms_datasets/convert.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def from_hdexaminer_peptide_pool(df: nw.DataFrame) -> nw.DataFrame:
    """Translate an HDExaminer peptide pool table into the OpenHDX column layout.

    Columns are renamed, only the mapped columns are kept (in mapping order),
    and the result is passed through ``cast_exposure``.

    Args:
        df: DataFrame with HDExaminer peptide pool column names.

    Returns:
        A DataFrame in OpenHDX format.
    """
    renames = {
        "State": "state",
        "Exposure": "exposure",
        "Start": "start",
        "End": "end",
        "Sequence": "sequence",
        "Charge": "charge",
        "#D": "uptake",
        "Start RT": "start_rt",
        "End RT": "end_rt",
        "Search RT": "search_rt",
    }
    openhdx_columns = [*renames.values()]

    # NOTE: no sort is applied here; rows are not reordered by this function.
    converted = df.rename(renames).select(openhdx_columns)

    return cast_exposure(converted)

from_hdexaminer_uptake_summary(df)

Convert from HDExaminer uptake summary format to OpenHDX format.

Source code in hdxms_datasets/convert.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def from_hdexaminer_uptake_summary(df: nw.DataFrame) -> nw.DataFrame:
    """Translate an HDExaminer uptake summary table into the OpenHDX column layout.

    Columns are renamed, only the mapped columns are kept (in mapping order),
    and the result is passed through ``cast_exposure``.

    Args:
        df: DataFrame with HDExaminer uptake summary column names.

    Returns:
        A DataFrame in OpenHDX format.
    """
    renames = {
        "Protein": "protein",
        "Protein State": "state",
        "Start": "start",
        "End": "end",
        "Deut Time (sec)": "exposure",
        # 'Peptide Mass' is not mapped (open question in the original code)
        "Sequence": "sequence",
        "#D": "uptake",
        "RT (min)": "rt",
        "#Rep": "n_replicates",
    }
    openhdx_columns = [*renames.values()]

    # NOTE: no sort is applied here; rows are not reordered by this function.
    converted = df.rename(renames).select(openhdx_columns)

    return cast_exposure(converted)

from_hxms(hxms_df, extra_columns='sequence')

Convert an HXMS DataFrame to OpenHDX format.

Parameters:

Name Type Description Default
hxms_df DataFrame

DataFrame in HXMS format.

required
extra_columns list[str] | dict[str, str] | str | None

Additional columns to include, either as a list/str of column name(s)
or a dictionary mapping original column names to new names.

'sequence'

Returns:

Type Description
DataFrame

A DataFrame in OpenHDX format.

Source code in hdxms_datasets/convert.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def from_hxms(
    hxms_df: nw.DataFrame,
    extra_columns: list[str] | dict[str, str] | str | None = "sequence",
) -> nw.DataFrame:
    """
    Translate an HXMS table into the OpenHDX column layout.

    Columns are renamed, the mapped columns plus any requested extras are
    kept, and the rows are sorted by exposure, start, end and replicate.

    Args:
        hxms_df: DataFrame in HXMS format.
        extra_columns: Additional columns to include, either as a list/str of column name(s)
            or a dictionary mapping original column names to new names.

    Returns:
        A DataFrame in OpenHDX format.

    """
    renames = {
        "START": "start",
        "END": "end",
        "REP": "replicate",
        "TIME(Sec)": "exposure",
        "UPTAKE": "uptake",
    }
    extras = _fmt_extra_columns(extra_columns)

    # Capture the output layout before the extras are merged into the mapping:
    # base columns first, caller-requested extras after.
    ordered = [*renames.values(), *extras.values()]
    renames.update(extras)

    selected = hxms_df.rename(renames).select(ordered)

    return selected.sort(by=["exposure", "start", "end", "replicate"])