Skip to content

verification

build_structure_peptides_comparison(structure, peptides)

Compares residue numbering and identity between a structure and peptides.

Applies any residue offset defined in the peptides' structure mapping.

Returns:

Type Description
DataFrame

A DataFrame merging structure and peptide residue information.

Source code in hdxms_datasets/verification.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
def build_structure_peptides_comparison(
    structure: Structure,
    peptides: Peptides,
) -> pl.DataFrame:
    """
    Compares residue numbering and identity between a structure and peptides.

    Applies any residue offset defined in the peptides' structure mapping.

    Returns:
        A DataFrame merging structure and peptide residue information.
    """
    structure_df = residue_df_from_structure(structure, peptides.structure_mapping)
    residue_df = residue_df_from_peptides(peptides)

    import polars as pl

    # apply residue offset to peptides
    # doesnt support 'mapping' dicts yet
    if peptides.structure_mapping.mapping:
        raise NotImplementedError("Custom residue mapping is not supported yet.")

    residue_df = residue_df.with_columns(
        (pl.col("resi") + peptides.structure_mapping.residue_offset).cast(str).alias("resi")
    )

    chains = (
        peptides.structure_mapping.chain
        if peptides.structure_mapping.chain is not None
        else structure_df["chain"].unique().to_list()
    )

    # supplement the residue_df with all chains
    # multi-chain peptides are expected to correspond to homomultimers
    import polars as pl

    residue_df_chain = pl.concat(
        [residue_df.with_columns(pl.lit(ch).alias("chain")) for ch in chains]
    )

    merged = residue_df_chain.join(
        structure_df,
        on=["chain", "resi"],
        how="left",
    )

    return merged

compare_structure_peptides(structure, peptides, returns='dict')

compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["dict"] = "dict",
) -> CompareSummary
compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["df"],
) -> pl.DataFrame
compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["both"],
) -> tuple[CompareSummary, pl.DataFrame]

Compares structure and peptide data.

Returned dataframe has the following columns:
resi (str): Residue number from peptides
resn: (str): One letter amino acid code from peptides
resn_TLA: (str): Three letter amino acid code from peptides
chain (str): Chain identifier
resn_TLA_right: (str): Three letter amino acid code from structure (null if no match)

Source code in hdxms_datasets/verification.py
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["dict", "df", "both"] = "dict",
) -> pl.DataFrame | CompareSummary | tuple[CompareSummary, pl.DataFrame]:
    """Compares structure and peptide data.

    Returned dataframe has the following columns:
    resi (str): Residue number from peptides
    resn: (str): One letter amino acid code from peptides
    resn_TLA: (str): Three letter amino acid code from peptides
    chain (str): Chain identifier
    resn_TLA_right: (str): Three letter amino acid code from structure (null if no match)

    """

    df = build_structure_peptides_comparison(structure, peptides)

    if returns == "dict":
        return summarize_compare_table(df)
    elif returns == "df":
        return df
    elif returns == "both":
        return summarize_compare_table(df), df
    else:
        raise ValueError(f"Invalid returns value: {returns!r}")

datafiles_exist(dataset)

Check if the data files for all peptides and structures in the dataset exist.

Source code in hdxms_datasets/verification.py
65
66
67
68
69
70
71
72
73
74
75
def datafiles_exist(dataset: HDXDataSet) -> bool:
    """
    Check if the data files for all peptides and structures in the dataset exist.
    """
    for state in dataset.states:
        for peptides in state.peptides:
            if not peptides.data_file.exists():
                return False
    if not dataset.structure.data_file.exists():
        return False
    return True

residue_df_from_peptides(peptides)

Create a dataframe from the peptides with resi, resn_TLA.

Parameters:

Name Type Description Default
peptides Peptides

Peptides object

required

Returns:

Type Description
DataFrame

DataFrame with columns:
resi: residue number (int)
resn: one letter amino acid code (str)
resn_TLA: three letter amino acid code (str)

Source code in hdxms_datasets/verification.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def residue_df_from_peptides(peptides: Peptides) -> pl.DataFrame:
    """Create a dataframe from the peptides with resi, resn_TLA.

    Args:
        peptides: Peptides object

    Returns:
        DataFrame with columns:
            resi: residue number (int)
            resn: one letter amino acid code (str)
            resn_TLA: three letter amino acid code (str)


    """
    from Bio.Data import IUPACData
    import polars as pl

    one_to_three = IUPACData.protein_letters_1to3

    peptide_df = peptides.load().to_native()

    start, end = peptide_df["start"].min(), peptide_df["end"].max()
    residues = range(start, end + 1)
    known_sequence = "X" * len(residues)
    sequence = reconstruct_sequence(peptide_df, known_sequence, n_term=start)

    residue_df = (
        pl.DataFrame({"resi": residues, "resn": list(sequence)})
        .filter(pl.col("resn") != "X")
        .with_columns(
            [
                pl.col("resi"),
                pl.col("resn").replace_strict(one_to_three).str.to_uppercase().alias("resn_TLA"),
            ]
        )
    )

    return residue_df

residue_df_from_structure(structure, mapping=StructureMapping())

Create a dataframe from the structure with chain, resi, resn_TLA

Source code in hdxms_datasets/verification.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def residue_df_from_structure(
    structure: Structure, mapping: StructureMapping = StructureMapping()
) -> pl.DataFrame:
    """Create a dataframe from the structure with chain, resi, resn_TLA"""
    if structure.format not in ["cif", "mmcif"]:
        raise ValueError(f"Unsupported structure format: {structure.format}")

    # create the reference structure amino acid dataframe
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict
    from Bio.Data import IUPACData
    import polars as pl

    mm = MMCIF2Dict(structure.data_file)

    one_to_three = IUPACData.protein_letters_1to3
    AA_codes = [a.upper() for a in list(one_to_three.values())]

    # create a dataframe with chain, resi, resn_TLA from structure
    chain_name = "label_asym_id" if not mapping.auth_chain_labels else "auth_asym_id"
    resn_name = "label_seq_id" if not mapping.auth_residue_numbers else "auth_seq_id"

    structure_df = (
        pl.DataFrame(
            {
                "chain": mm["_atom_site." + chain_name],
                "resi": mm["_atom_site." + resn_name],
                "resn_TLA": [s.upper() for s in mm["_atom_site.label_comp_id"]],
            }
        )
        .unique()
        .filter(pl.col("resn_TLA").is_in(AA_codes))
    )

    return structure_df

residue_offset_optimization(structure, peptides, search_range=(-100, 100))

Optimize residue offset to maximize matched residues between structure and peptides.
Ignores current offset on the peptides' structure mapping.

Parameters:

Name Type Description Default
structure Structure

Structure object

required
peptides Peptides

Peptides object

required
search_range tuple[int, int]

Tuple of (min_offset, max_offset) to search

(-100, 100)

Returns:

Type Description
int

The optimal residue offset as an integer.

Source code in hdxms_datasets/verification.py
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
def residue_offset_optimization(
    structure: Structure,
    peptides: Peptides,
    search_range: tuple[int, int] = (-100, 100),
) -> int:
    """
    Optimize residue offset to maximize matched residues between structure and peptides.
    Ignores current offset on the peptides' structure mapping.

    Args:
        structure: Structure object
        peptides: Peptides object
        search_range: Tuple of (min_offset, max_offset) to search

    Returns:
        The optimal residue offset as an integer.
    """

    import polars as pl

    structure_df = residue_df_from_structure(structure, peptides.structure_mapping)
    residue_df = residue_df_from_peptides(peptides)
    residue_df = pl.concat(
        [
            residue_df.with_columns(pl.lit(ch).alias("chain"))
            for ch in peptides.structure_mapping.chain or []
        ]
    )

    best_offset = search_range[0]
    prev_result = -float("inf")
    for offset in range(*search_range):
        residue_df_with_offset = residue_df.with_columns(
            (pl.col("resi").cast(int) + offset).cast(str).alias("resi")
        )

        merged = residue_df_with_offset.join(
            structure_df,
            on=["chain", "resi"],
            how="left",
        )

        matched = merged.drop_nulls(subset=["resn_TLA_right"]).filter(
            pl.col("resn_TLA") == pl.col("resn_TLA_right")
        )

        result = matched.height
        if result > prev_result:
            best_offset = offset
            prev_result = result

    return best_offset

summarize_compare_table(df)

Derive the metrics from the merged table.

Source code in hdxms_datasets/verification.py
228
229
230
231
232
233
234
235
236
237
238
239
240
def summarize_compare_table(df: pl.DataFrame) -> CompareSummary:
    """Derive the metrics from the merged table."""
    # rows with a structure match (adjust column names to your schema)
    import polars as pl

    matched = df.drop_nulls(subset=["resn_TLA_right"])
    identical = matched.filter(pl.col("resn_TLA") == pl.col("resn_TLA_right"))

    return {
        "total_residues": df.height,
        "matched_residues": matched.height,
        "identical_residues": identical.height,
    }

verify_dataset(dataset, strict=True)

Verify the integrity of the dataset by checking sequences and data files.

Source code in hdxms_datasets/verification.py
11
12
13
14
15
16
17
18
19
20
21
def verify_dataset(dataset: HDXDataSet, strict: bool = True):
    """Verify the integrity of the dataset by checking sequences and data files."""
    verify_peptides(dataset)
    if not datafiles_exist(dataset):
        raise ValueError("Missing datafiles")

    if dataset.file_hash is None:
        raise ValueError("Dataset file hash is not set")

    if strict:
        verify_version(dataset)

verify_peptides(dataset)

Verify that all peptide sequences match the protein sequence in the dataset states.

Source code in hdxms_datasets/verification.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def verify_peptides(dataset: HDXDataSet):
    """Verify that all peptide sequences match the protein sequence in the dataset states."""
    for state in dataset.states:
        sequence = state.protein_state.sequence
        for i, peptides in enumerate(state.peptides):
            try:
                peptide_table = peptides.load()
            except Exception as e:
                raise ValueError(f"State: {state.name}, Peptides[{i}] failed to load: {e}")
            try:
                mismatches = verify_sequence(
                    peptide_table, sequence, n_term=state.protein_state.n_term
                )
            except IndexError as e:
                raise IndexError(f"State: {state.name}, Peptides[{i}] has an index error: {e}")
            if mismatches:
                raise ValueError(
                    f"State: {state.name}, Peptides[{i}] does not match protein sequence, mismatches: {mismatches}"
                )

verify_version(dataset)

Verify that the dataset was created with a pep 440 compliant version.

Source code in hdxms_datasets/verification.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def verify_version(dataset: HDXDataSet):
    """Verify that the dataset was created with a pep 440 compliant version."""
    ver_str = dataset.metadata.package_version

    v = Version(ver_str)  # type: ignore

    if v.pre:
        raise ValueError(
            f"A pre-release version of `hdxms-datasets` was used to create this dataset: {ver_str}"
        )
    if v.dev:
        raise ValueError(
            f"A development version of `hdxms-datasets` was used to create this dataset: {ver_str}"
        )
    if v.local:
        raise ValueError(
            f"A local version of `hdxms-datasets` was used to create this dataset: {ver_str}"
        )