Skip to content

process

TemperatureDict

Bases: TypedDict

TypedDict for temperature dictionary.

Source code in hdxms_datasets/process.py
143
144
145
146
147
class TemperatureDict(TypedDict):
    """Temperature specification: a numeric value together with its unit.

    Keys:
        value: Temperature magnitude, expressed in ``unit``.
        unit: Unit of ``value``; "C" for Celsius or "K" for Kelvin.
    """

    # magnitude of the temperature
    value: float
    # unit of `value`: Celsius ("C") or Kelvin ("K")
    unit: Literal["C", "K"]

aggregate_columns(df, columns, by=['start', 'end', 'exposure'])

Aggregate the specified columns of the DataFrame by intensity-weighted average.

Source code in hdxms_datasets/process.py
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def aggregate_columns(
    df: nw.DataFrame, columns: list[str], by: Optional[list[str]] = None
) -> nw.DataFrame:
    """
    Aggregate the specified columns of the DataFrame by intensity-weighted average.

    Args:
        df: Input dataframe; must contain an "intensity" column used as weights.
        columns: Columns to aggregate. For each column ``col`` the output gains
            ``col`` (weighted mean) and ``col_sd`` (weighted standard deviation).
        by: Columns to group by. Defaults to ``["start", "end", "exposure"]``.

    Returns:
        Aggregated dataframe with one row per group.
    """
    # `None` sentinel instead of a mutable default list (shared across calls).
    if by is None:
        by = ["start", "end", "exposure"]

    output: dict[str, list] = {k: [] for k in by}
    for col in columns:
        output[col] = []
        output[f"{col}_sd"] = []

    # Unpack group keys positionally against `by`; the previous implementation
    # hard-coded start/end/exposure and silently broke for any other `by`.
    for group_key, df_group in df.group_by(by):
        for name, value in zip(by, group_key):
            output[name].append(value)

        for col in columns:
            val = ufloat_stats(df_group[col], df_group["intensity"])
            output[col].append(val.nominal_value)
            output[f"{col}_sd"].append(val.std_dev)

    agg_df = nw.from_dict(output, backend=BACKEND)
    return agg_df

compute_uptake_metrics(df, exception='raise')

Tries to add derived columns to the DataFrame. Possible columns to add are: uptake, uptake_sd, fd_uptake, fd_uptake_sd, rfu, max_uptake.

Source code in hdxms_datasets/process.py
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
def compute_uptake_metrics(df: nw.DataFrame, exception: str = "raise") -> nw.DataFrame:
    """
    Tries to add derived columns to the DataFrame.
    Possible columns to add are: uptake, uptake_sd, fd_uptake, fd_uptake_sd, rfu, max_uptake.

    Args:
        df: Input dataframe.
        exception: Per-column failure policy: "raise" re-raises the error,
            "warn" emits a warning, "ignore" skips the column silently.

    Returns:
        The dataframe with every derived column that could be computed.

    Raises:
        ValueError: If `exception` is not one of "raise", "warn", "ignore".
    """
    # Validate the option up front; previously an invalid value was only
    # detected if a column computation happened to fail.
    if exception not in ("raise", "warn", "ignore"):
        raise ValueError("Invalid exception handling option")

    all_columns = {
        "uptake": hdx_expr.uptake,
        "uptake_sd": hdx_expr.uptake_sd,
        "fd_uptake": hdx_expr.fd_uptake,
        "fd_uptake_sd": hdx_expr.fd_uptake_sd,
        "rfu": hdx_expr.rfu,
        "max_uptake": hdx_expr.max_uptake,
    }

    # Only attempt columns that are not already present.
    for col, expr in all_columns.items():
        if col not in df.columns:
            try:
                df = df.with_columns(expr)
            except Exception as e:
                if exception == "raise":
                    raise  # bare raise preserves the original traceback
                if exception == "warn":
                    warnings.warn(f"Failed to add column {col}: {e}")
                # "ignore": skip this column and continue with the next

    return df

convert_temperature(temperature_dict, target_unit='C')

Convenience function to convert temperature values.

Parameters:

Name Type Description Default
temperature_dict TemperatureDict

Dictionary with temperature value(s) and unit.

required
target_unit str

Target unit for temperature. Must be "C" or "K".

'C'

Returns:

Type Description
float

Converted temperature value(s).

Source code in hdxms_datasets/process.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
def convert_temperature(temperature_dict: TemperatureDict, target_unit: str = "C") -> float:
    """
    Convenience function to convert temperature values.

    Args:
        temperature_dict: Dictionary with temperature value(s) and unit.
        target_unit: Target unit for temperature. Must be "C" or "K".

    Returns:
        Converted temperature value(s).
    """
    # Difference of the unit offsets shifts the value from the source scale
    # to the target scale.
    offset = (
        TEMPERATURE_OFFSETS[temperature_dict["unit"]]
        - TEMPERATURE_OFFSETS[target_unit]
    )
    return temperature_dict["value"] + offset

convert_time(time_dict, target_unit='s')

Convenience function to convert time values.

Parameters:

Name Type Description Default
time_dict dict

Dictionary with time value(s) and unit.

required
target_unit Literal['s', 'min', 'h']

Target unit for time.

's'

Returns:

Type Description
Union[float, list[float]]

Converted time value(s).

Source code in hdxms_datasets/process.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def convert_time(
    time_dict: dict, target_unit: Literal["s", "min", "h"] = "s"
) -> Union[float, list[float]]:
    """
    Convenience function to convert time values.

    .. deprecated::
        This function unconditionally raises ``DeprecationWarning``; the code
        below the raise is unreachable and kept only for reference.

    Args:
        time_dict: Dictionary with time value(s) and unit.
        target_unit: Target unit for time.

    Returns:
        Converted time value(s).
    """
    # Deprecation is raised as an exception (not warnings.warn), so every
    # call fails here; everything below is dead code.
    raise DeprecationWarning()
    src_unit = time_dict["unit"]

    time_factor = TIME_FACTORS[src_unit] / TIME_FACTORS[target_unit]
    if values := time_dict.get("values"):
        return [v * time_factor for v in values]
    elif value := time_dict.get("value"):
        # NOTE(review): a falsy "value" (0) falls through to the ValueError
        # branch — confirm whether zero-time input was meant to be valid.
        return value * time_factor
    else:
        raise ValueError("Invalid time dictionary")

drop_null_columns(df)

Drop columns that are all null from the DataFrame.

Source code in hdxms_datasets/process.py
329
330
331
332
def drop_null_columns(df: nw.DataFrame) -> nw.DataFrame:
    """Remove every column whose values are entirely null."""
    to_drop = []
    for name in df.columns:
        if df[name].is_null().all():
            to_drop.append(name)
    return df.drop(to_drop)

dynamx_cluster_to_state(cluster_data, nd_exposure=0.0)

Convert DynamX cluster data to state data. The input must contain only a single state.

Source code in hdxms_datasets/process.py
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def dynamx_cluster_to_state(cluster_data: nw.DataFrame, nd_exposure: float = 0.0) -> nw.DataFrame:
    """
    Convert DynamX cluster data to state data.

    The input must contain exactly one protein state. Rows whose "exposure"
    equals ``nd_exposure`` serve as the undeuterated (ND) reference; uptake is
    the difference between a group's weighted mass and its ND reference mass.

    Args:
        cluster_data: Cluster-level dataframe; columns used include "state",
            "exposure", "start", "end", "z", "center", "inten", "rt", plus the
            metadata columns listed in ``unique_columns`` below.
        nd_exposure: Exposure value identifying the undeuterated control rows.

    Returns:
        State-level dataframe with one aggregated row per
        (start, end, exposure) group.
    """

    # NOTE(review): `assert` is stripped under `python -O`; raise ValueError
    # instead if this check must hold in production.
    assert len(cluster_data["state"].unique()) == 1, "Multiple states found in data"

    # determine undeuterated masses per peptide
    nd_data = cluster_data.filter(nw.col("exposure") == nd_exposure)
    # unique (start, end) pairs present in the ND reference rows
    nd_peptides: list[tuple[int, int]] = sorted(
        {(start, end) for start, end in zip(nd_data["start"], nd_data["end"])}
    )

    peptides_nd_mass = {}
    for p in nd_peptides:
        start, end = p
        df_nd_peptide = nd_data.filter((nw.col("start") == start) & (nw.col("end") == end))

        # per-replicate mass from charge z and "center" (presumably an m/z
        # centroid — confirm against the DynamX cluster format)
        masses = df_nd_peptide["z"] * (df_nd_peptide["center"] - PROTON_MASS)
        # intensity-weighted mean ± sd of the ND masses for this peptide
        nd_mass = ufloat_stats(masses, df_nd_peptide["inten"])

        peptides_nd_mass[p] = nd_mass

    groups = cluster_data.group_by(["start", "end", "exposure"])
    # Columns expected to be constant within a group; the first row's value is
    # taken as representative for the aggregated record.
    unique_columns = [
        "end",
        "exposure",
        "fragment",
        "maxuptake",
        "mhp",
        "modification",
        "protein",
        "sequence",
        "start",
        "state",
        "stop",
    ]
    records = []
    for (start, end, exposure), df_group in groups:
        record = {col: df_group[col][0] for col in unique_columns}

        # intensity-weighted retention time
        rt = ufloat_stats(df_group["rt"], df_group["inten"])
        record["rt"] = rt.nominal_value
        record["rt_sd"] = rt.std_dev

        # state data 'center' is mass as if |charge| would be 1
        center = ufloat_stats(
            df_group["z"] * (df_group["center"] - PROTON_MASS) + PROTON_MASS, df_group["inten"]
        )
        record["center"] = center.nominal_value
        record["center_sd"] = center.std_dev

        masses = df_group["z"] * (df_group["center"] - PROTON_MASS)
        exp_mass = ufloat_stats(masses, df_group["inten"])

        # uptake = experimental mass - ND reference mass; None when the
        # peptide has no undeuterated reference rows
        if (start, end) in peptides_nd_mass:
            uptake = exp_mass - peptides_nd_mass[(start, end)]
            record["uptake"] = uptake.nominal_value
            record["uptake_sd"] = uptake.std_dev
        else:
            record["uptake"] = None
            record["uptake_sd"] = None

        records.append(record)

    d = records_to_dict(records)
    df = nw.from_dict(d, backend=BACKEND)

    # reorder to the canonical column order only when the column sets match
    if set(df.columns) == set(STATE_DATA_COLUMN_ORDER):
        df = df[STATE_DATA_COLUMN_ORDER]

    return df

filter_peptides(df, state=None, exposure=None)

Convenience function to filter a peptides DataFrame.

Parameters:

Name Type Description Default
df DataFrame

Input dataframe.

required
state Optional[str]

Name of protein state to select.

None
exposure Optional[dict]

Exposure value(s) to select. Exposure is given as a :obj:dict, with keys "value" or "values" for exposure value, and "unit" for the time unit.

None
time_unit

Time unit for exposure column of supplied dataframe.

required

Examples:

Filter peptides for a specific protein state and exposure time:

>>> d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
>>> filtered_df = filter_peptides(df, **d)

Returns:

Type Description
DataFrame

Filtered dataframe.

Source code in hdxms_datasets/process.py
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
def filter_peptides(
    df: nw.DataFrame,
    state: Optional[str] = None,
    exposure: Optional[dict] = None,
) -> nw.DataFrame:
    """
    Convenience function to filter a peptides DataFrame.

    .. deprecated::
        This function unconditionally raises ``DeprecationWarning``; the code
        below the raise is unreachable and kept only for reference.

    Args:
        df: Input dataframe.
        state: Name of protein state to select.
        exposure: Exposure value(s) to select. Exposure is given as a :obj:`dict`, with keys "value" or "values" for
            exposure value, and "unit" for the time unit.

    Examples:
        Filter peptides for a specific protein state and exposure time:

        >>> d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
        >>> filtered_df = filter_peptides(df, **d)

    Returns:
        Filtered dataframe.
    """
    # Deprecation is raised as an exception, so every call fails here;
    # everything below is dead code.
    raise DeprecationWarning()
    if state is not None:
        df = df.filter(nw.col("state") == state)

    if exposure is not None:
        # NA unit is used when exposure is given as string, in case of HD examiner this can be 'FD'
        if exposure["unit"] == "NA":
            t_val = exposure["value"]
        else:
            t_val = convert_time(exposure, "s")
        if isinstance(t_val, list):
            if all(isinstance(v, float) for v in t_val):
                col = nw.col("exposure")
            elif all(isinstance(v, str) for v in t_val):
                # NOTE(review): casting the column to Float64 while t_val holds
                # strings looks inverted — presumably the values (not the
                # column) should be cast; confirm before reviving this code.
                col = nw.col("exposure").cast(nw.Float64)
            else:
                raise ValueError("Invalid exposure values")
            df = df.filter(col.is_in(t_val))
        else:
            df = df.filter(nw.col("exposure") == t_val)

    return df

records_to_dict(records)

Convert a list of records to a dictionary of lists.

Parameters:

Name Type Description Default
records list[dict]

List of dictionaries.

required

Returns:

Type Description
dict[str, list]

Dictionary with keys as column names and values as lists.

Source code in hdxms_datasets/process.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def records_to_dict(records: list[dict]) -> dict[str, list]:
    """
    Convert a list of records to a dictionary of lists.

    Keys appear in first-seen order; a key missing from some records simply
    contributes no entry for those records, so list lengths may differ.

    Args:
        records: List of dictionaries.

    Returns:
        Dictionary with keys as column names and values as lists.
    """
    columns: dict[str, list] = {}
    for record in records:
        for key, value in record.items():
            columns.setdefault(key, []).append(value)

    return columns

sort(df)

Sorts the DataFrame by state, exposure, start, end, file.

Source code in hdxms_datasets/process.py
322
323
324
325
326
def sort(df: nw.DataFrame) -> nw.DataFrame:
    """Sort the DataFrame by state, exposure, start, end and file, skipping any of these columns that are absent."""
    preferred_order = ("state", "exposure", "start", "end", "file")
    present = [name for name in preferred_order if name in df.columns]
    return df.sort(by=present)

ufloat_stats(array, weights)

Calculate the weighted mean and standard deviation.

Source code in hdxms_datasets/process.py
43
44
45
46
def ufloat_stats(array, weights) -> Variable:
    """Return the weighted mean and standard deviation of `array` as a ufloat."""
    stats = DescrStatsW(array, weights=weights, ddof=0)
    return ufloat(stats.mean, stats.std)