Skip to content

process

TemperatureDict

Bases: TypedDict

TypedDict for temperature dictionary.

Source code in hdxms_datasets/process.py
147
148
149
150
151
class TemperatureDict(TypedDict):
    """Temperature specification: a numeric value together with its unit."""

    value: float  # temperature magnitude, expressed in `unit`
    unit: Literal["C", "K"]  # Celsius or Kelvin

aggregate_columns(df, columns, by=['start', 'end', 'exposure'])

Aggregate the specified columns of the DataFrame by intensity-weighted average.

Source code in hdxms_datasets/process.py
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
def aggregate_columns(
    df: nw.DataFrame, columns: list[str], by: Optional[list[str]] = None
):
    """
    Aggregate the specified columns of the DataFrame by intensity-weighted average.

    Args:
        df: Input dataframe. Must contain an "intensity" column in addition to
            `columns` and `by`.
        columns: Columns to aggregate. For each column, a weighted mean (same name)
            and its standard deviation ("<col>_sd") are produced.
        by: Columns to group by. Defaults to ["start", "end", "exposure"].

    Returns:
        Aggregated dataframe with the group-key columns, weighted means and
        standard deviations.
    """
    # Avoid a mutable default argument; None stands in for the canonical key set.
    by = ["start", "end", "exposure"] if by is None else by

    output: dict[str, list] = {k: [] for k in by}
    for col in columns:
        output[col] = []
        output[f"{col}_sd"] = []

    # group_by yields (key_tuple, sub-dataframe); append each key component under
    # its own column. The original hard-coded (start, end, exposure) unpacking,
    # which broke for any non-default `by`.
    for key, df_group in df.group_by(by):
        for k, v in zip(by, key):
            output[k].append(v)

        for col in columns:
            val = ufloat_stats(df_group[col], df_group["intensity"])
            output[col].append(val.nominal_value)
            output[f"{col}_sd"].append(val.std_dev)

    agg_df = nw.from_dict(output, backend=BACKEND)
    return agg_df

compute_uptake_metrics(df, exception='raise')

Tries to add derived columns to the DataFrame. Possible columns to add are: uptake, uptake_sd, fd_uptake, fd_uptake_sd, rfu, max_uptake.

Source code in hdxms_datasets/process.py
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
def compute_uptake_metrics(df: nw.DataFrame, exception: str = "raise") -> nw.DataFrame:
    """
    Tries to add derived columns to the DataFrame.
    Possible columns to add are: uptake, uptake_sd, fd_uptake, fd_uptake_sd, rfu, max_uptake.

    Args:
        df: Input dataframe.
        exception: How to handle a failing column computation. One of "raise"
            (default), "warn" or "ignore".

    Returns:
        DataFrame with any successfully computed derived columns added.

    Raises:
        ValueError: If `exception` is not one of "raise", "warn", "ignore".
    """
    # Validate upfront: the original only rejected a bad option when a column
    # computation happened to fail, silently accepting invalid values otherwise.
    if exception not in ("raise", "warn", "ignore"):
        raise ValueError("Invalid exception handling option")

    all_columns = {
        "uptake": hdx_expr.uptake,
        "uptake_sd": hdx_expr.uptake_sd,
        "fd_uptake": hdx_expr.fd_uptake,
        "fd_uptake_sd": hdx_expr.fd_uptake_sd,
        "rfu": hdx_expr.rfu,
        "max_uptake": hdx_expr.max_uptake,
    }

    for col, expr in all_columns.items():
        if col in df.columns:
            continue  # column already present; leave it untouched
        try:
            df = df.with_columns(expr)
        except Exception as e:
            if exception == "raise":
                raise  # bare raise preserves the original traceback
            if exception == "warn":
                warnings.warn(f"Failed to add column {col}: {e}")
            # "ignore": skip this column silently

    return df

convert_temperature(temperature_dict, target_unit='C')

Convenience function to convert temperature values.

Parameters:

Name Type Description Default
temperature_dict TemperatureDict

Dictionary with temperature value(s) and unit.

required
target_unit str

Target unit for temperature. Must be "C" or "K".

'C'

Returns:

Type Description
float

Converted temperature value(s).

Source code in hdxms_datasets/process.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def convert_temperature(temperature_dict: TemperatureDict, target_unit: str = "C") -> float:
    """
    Convenience function to convert temperature values.

    Args:
        temperature_dict: Dictionary with temperature value(s) and unit.
        target_unit: Target unit for temperature. Must be "C" or "K".

    Returns:
        Converted temperature value(s).

    Raises:
        KeyError: If the source or target unit is not a key of TEMPERATURE_OFFSETS.
    """

    src_unit = temperature_dict["unit"]
    # Celsius and Kelvin differ only by an additive offset, so conversion is a
    # single addition: offset(src) - offset(target).
    temp_offset = TEMPERATURE_OFFSETS[src_unit] - TEMPERATURE_OFFSETS[target_unit]
    return temperature_dict["value"] + temp_offset

convert_time(time_dict, target_unit='s')

Convenience function to convert time values.

Parameters:

Name Type Description Default
time_dict dict

Dictionary with time value(s) and unit.

required
target_unit Literal['s', 'min', 'h']

Target unit for time.

's'

Returns:

Type Description
Union[float, list[float]]

Converted time value(s).

Source code in hdxms_datasets/process.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def convert_time(
    time_dict: dict, target_unit: Literal["s", "min", "h"] = "s"
) -> Union[float, list[float]]:
    """
    Convenience function to convert time values.

    .. deprecated::
        This function unconditionally raises ``DeprecationWarning``; the code
        below the raise is unreachable and kept only for reference.

    Args:
        time_dict: Dictionary with time value(s) and unit.
        target_unit: Target unit for time.

    Returns:
        Converted time value(s).
    """
    raise DeprecationWarning()
    # NOTE(review): everything below is dead code (unreachable after the raise).
    src_unit = time_dict["unit"]

    time_factor = TIME_FACTORS[src_unit] / TIME_FACTORS[target_unit]
    # NOTE(review): walrus + truthiness means a value of 0 (or an empty "values"
    # list) would fall through to the ValueError — relevant if ever revived.
    if values := time_dict.get("values"):
        return [v * time_factor for v in values]
    elif value := time_dict.get("value"):
        return value * time_factor
    else:
        raise ValueError("Invalid time dictionary")

drop_null_columns(df)

Drop columns that are all null from the DataFrame.

Source code in hdxms_datasets/process.py
359
360
361
362
def drop_null_columns(df: nw.DataFrame) -> nw.DataFrame:
    """Remove every column whose values are entirely null."""
    return df.drop([name for name in df.columns if df[name].is_null().all()])

dynamx_cluster_to_state(cluster_data, nd_exposure=0.0)

Convert DynamX cluster data to state data. The input must contain only a single state.

Source code in hdxms_datasets/process.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def dynamx_cluster_to_state(cluster_data: nw.DataFrame, nd_exposure: float = 0.0) -> nw.DataFrame:
    """
    Convert DynamX cluster data to state data.
    The input must contain only a single state.

    Args:
        cluster_data: DynamX cluster-level data (multiple rows per peptide/exposure).
        nd_exposure: Exposure value identifying the non-deuterated (ND) control rows.

    Returns:
        State-level dataframe with one row per (start, end, exposure) group.
    """

    assert len(cluster_data["state"].unique()) == 1, "Multiple states found in data"

    # determine undeuterated masses per peptide
    nd_data = cluster_data.filter(nw.col("exposure") == nd_exposure)
    nd_peptides: list[tuple[int, int]] = sorted(
        {(start, end) for start, end in zip(nd_data["start"], nd_data["end"])}
    )

    # Intensity-weighted ND mass per (start, end) peptide; neutral mass is
    # reconstructed from the centroid as z * (center - proton mass).
    peptides_nd_mass = {}
    for p in nd_peptides:
        start, end = p
        df_nd_peptide = nd_data.filter((nw.col("start") == start) & (nw.col("end") == end))

        masses = df_nd_peptide["z"] * (df_nd_peptide["center"] - PROTON_MASS)
        nd_mass = ufloat_stats(masses, df_nd_peptide["inten"])

        peptides_nd_mass[p] = nd_mass

    groups = cluster_data.group_by(["start", "end", "exposure"])
    # Columns assumed constant within each (start, end, exposure) group; the
    # first row's value is taken as representative for the whole group.
    unique_columns = [
        "end",
        "exposure",
        "fragment",
        "maxuptake",
        "mhp",
        "modification",
        "protein",
        "sequence",
        "start",
        "state",
        "stop",
    ]
    records = []
    for (start, end, exposure), df_group in groups:
        record = {col: df_group[col][0] for col in unique_columns}

        # intensity-weighted retention time and its standard deviation
        rt = ufloat_stats(df_group["rt"], df_group["inten"])
        record["rt"] = rt.nominal_value
        record["rt_sd"] = rt.std_dev

        # state data 'center' is mass as if |charge| would be 1
        center = ufloat_stats(
            df_group["z"] * (df_group["center"] - PROTON_MASS) + PROTON_MASS, df_group["inten"]
        )
        record["center"] = center.nominal_value
        record["center_sd"] = center.std_dev

        masses = df_group["z"] * (df_group["center"] - PROTON_MASS)
        exp_mass = ufloat_stats(masses, df_group["inten"])

        # uptake = deuterated mass minus the ND control mass; None when the
        # peptide has no ND control row
        if (start, end) in peptides_nd_mass:
            uptake = exp_mass - peptides_nd_mass[(start, end)]
            record["uptake"] = uptake.nominal_value
            record["uptake_sd"] = uptake.std_dev
        else:
            record["uptake"] = None
            record["uptake_sd"] = None

        records.append(record)

    d = records_to_dict(records)
    df = nw.from_dict(d, backend=BACKEND)

    # apply the canonical column order only when the produced columns match exactly
    if set(df.columns) == set(STATE_DATA_COLUMN_ORDER):
        df = df[STATE_DATA_COLUMN_ORDER]

    return df

filter_peptides(df, state=None, exposure=None)

Convenience function to filter a peptides DataFrame.

Parameters:

Name Type Description Default
df DataFrame

Input dataframe.

required
state Optional[str]

Name of protein state to select.

None
exposure Optional[dict]

Exposure value(s) to select. Exposure is given as a dict, with keys "value" or "values" for the exposure value, and "unit" for the time unit.

None
time_unit

Time unit for exposure column of supplied dataframe.

required

Examples:

Filter peptides for a specific protein state and exposure time:

>>> d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
>>> filtered_df = filter_peptides(df, **d)

Returns:

Type Description
DataFrame

Filtered dataframe.

Source code in hdxms_datasets/process.py
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
def filter_peptides(
    df: nw.DataFrame,
    state: Optional[str] = None,
    exposure: Optional[dict] = None,
) -> nw.DataFrame:
    """
    Convenience function to filter a peptides DataFrame.

    .. deprecated::
        This function unconditionally raises ``DeprecationWarning``; the code
        below the raise is unreachable and kept only for reference.

    Args:
        df: Input dataframe.
        state: Name of protein state to select.
        exposure: Exposure value(s) to select. Exposure is given as a :obj:`dict`, with keys "value" or "values" for
            exposure value, and "unit" for the time unit.

    Examples:
        Filter peptides for a specific protein state and exposure time:

        >>> d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
        >>> filtered_df = filter_peptides(df, **d)

    Returns:
        Filtered dataframe.
    """
    raise DeprecationWarning()
    # NOTE(review): everything below is dead code (unreachable after the raise).
    if state is not None:
        df = df.filter(nw.col("state") == state)

    if exposure is not None:
        # NA unit is used when exposure is given as string, in case of HD examiner this can be 'FD'
        if exposure["unit"] == "NA":
            t_val = exposure["value"]
        else:
            t_val = convert_time(exposure, "s")
        if isinstance(t_val, list):
            if all(isinstance(v, float) for v in t_val):
                col = nw.col("exposure")
            elif all(isinstance(v, str) for v in t_val):
                col = nw.col("exposure").cast(nw.Float64)
            else:
                raise ValueError("Invalid exposure values")
            df = df.filter(col.is_in(t_val))
        else:
            df = df.filter(nw.col("exposure") == t_val)

    return df

parse_data_files(data_file_spec, data_dir)

Parse data file specifications from a YAML file.

Parameters:

Name Type Description Default
data_file_spec dict

Dictionary with data file specifications.

required
data_dir Path

Path to data directory.

required

Returns:

Type Description
dict[str, DataFile]

Dictionary with parsed data file specifications.

Source code in hdxms_datasets/process.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
def parse_data_files(data_file_spec: dict, data_dir: Path) -> dict[str, DataFile]:
    """
    Parse data file specifications (as loaded from a YAML file) into DataFile objects.

    Args:
        data_file_spec: Mapping of data file name to its specification; each spec
            must contain a "filename" key, and all other keys are forwarded to
            the ``DataFile`` constructor.
        data_dir: Path to the directory containing the data files.

    Returns:
        Dictionary mapping names to parsed DataFile objects.
    """

    # local import avoids a circular import at module load time
    from hdxms_datasets import DataFile

    data_files = {}
    for name, spec in data_file_spec.items():
        extra_kwargs = {k: v for k, v in spec.items() if k != "filename"}
        datafile = DataFile(
            name=name,
            # `data_dir / filename` is already a Path; the original's extra
            # Path(...) wrap was redundant.
            filepath_or_buffer=data_dir / spec["filename"],
            **extra_kwargs,
        )
        data_files[name] = datafile

    return data_files

records_to_dict(records)

Convert a list of records to a dictionary of lists.

Parameters:

Name Type Description Default
records list[dict]

List of dictionaries.

required

Returns:

Type Description
dict[str, list]

Dictionary with keys as column names and values as lists.

Source code in hdxms_datasets/process.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def records_to_dict(records: list[dict]) -> dict[str, list]:
    """
    Convert a list of records (row dicts) into a column-oriented dictionary.

    Args:
        records: List of dictionaries, each mapping column name to value.

    Returns:
        Dictionary with keys as column names and values as lists, keys ordered
        by first appearance across the records.
    """

    columns: dict[str, list] = {}
    for row in records:
        for name, entry in row.items():
            columns.setdefault(name, []).append(entry)

    return columns

sort(df)

Sorts the DataFrame by state, exposure, start, end, file.

Source code in hdxms_datasets/process.py
352
353
354
355
356
def sort(df: nw.DataFrame) -> nw.DataFrame:
    """Sorts the DataFrame by state, exposure, start, end, file (skipping absent columns)."""
    preferred_order = ("state", "exposure", "start", "end", "file")
    present = [name for name in preferred_order if name in df.columns]
    return df.sort(by=present)

ufloat_stats(array, weights)

Calculate the weighted mean and standard deviation.

Source code in hdxms_datasets/process.py
47
48
49
50
def ufloat_stats(array, weights) -> Variable:
    """Return a ufloat carrying the weighted mean and (population) standard deviation."""
    stats = DescrStatsW(array, weights=weights, ddof=0)
    mean, std = stats.mean, stats.std
    return ufloat(mean, std)