Skip to content

verification

build_structure_peptides_comparison(structure, peptides)

Compares residue numbering and identity between a structure and peptides.

Applies any residue offset defined in the peptides' structure mapping.

Returns:

Type Description
DataFrame

A DataFrame merging structure and peptide residue information.

Source code in hdxms_datasets/verification.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
def build_structure_peptides_comparison(
    structure: Structure,
    peptides: Peptides,
) -> pl.DataFrame:
    """
    Compares residue numbering and identity between a structure and peptides.

    Applies any residue offset defined in the peptides' structure mapping.

    Returns:
        A DataFrame merging structure and peptide residue information.
    """
    structure_df = residue_df_from_structure(structure, peptides.structure_mapping)
    residue_df = residue_df_from_peptides(peptides)

    import polars as pl

    # apply residue offset to peptides
    residue_df = residue_df.with_columns(
        (pl.col("resi") + peptides.structure_mapping.residue_offset).cast(str).alias("resi")
    )

    chains = (
        peptides.structure_mapping.chain
        if peptides.structure_mapping.chain is not None
        else structure_df["chain"].unique().to_list()
    )

    # supplement the residue_df with all chains
    # multi-chain peptides are expected to correspond to homomultimers
    import polars as pl

    residue_df_chain = pl.concat(
        [residue_df.with_columns(pl.lit(ch).alias("chain")) for ch in chains]
    )

    merged = residue_df_chain.join(
        structure_df,
        on=["chain", "resi"],
        how="left",
    )

    return merged

compare_structure_peptides(structure, peptides, returns='dict')

compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["dict"] = "dict",
) -> CompareSummary
compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["df"],
) -> pl.DataFrame
compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["both"],
) -> tuple[CompareSummary, pl.DataFrame]

Compares structure and peptide data.

Source code in hdxms_datasets/verification.py
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
def compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["dict", "df", "both"] = "dict",
) -> pl.DataFrame | CompareSummary | tuple[CompareSummary, pl.DataFrame]:
    """Compares structure and peptide data."""

    df = build_structure_peptides_comparison(structure, peptides)

    if returns == "dict":
        return summarize_compare_table(df)
    elif returns == "df":
        return df
    elif returns == "both":
        return summarize_compare_table(df), df
    else:
        raise ValueError(f"Invalid returns value: {returns!r}")

datafiles_exist(dataset)

Check if the data files for all peptides and structures in the dataset exist.

Source code in hdxms_datasets/verification.py
65
66
67
68
69
70
71
72
73
74
75
def datafiles_exist(dataset: HDXDataSet) -> bool:
    """
    Check if the data files for all peptides and structures in the dataset exist.
    """
    for state in dataset.states:
        for peptides in state.peptides:
            if not peptides.data_file.exists():
                return False
    if not dataset.structure.data_file.exists():
        return False
    return True

residue_df_from_peptides(peptides)

Create a dataframe from the peptides with resi, resn_TLA.

Parameters:

Name Type Description Default
peptides Peptides

Peptides object

required

Returns:

Type Description
DataFrame

DataFrame with columns:
resi: residue number (int)
resn: one letter amino acid code (str)
resn_TLA: three letter amino acid code (str)

Source code in hdxms_datasets/verification.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def residue_df_from_peptides(peptides: Peptides) -> pl.DataFrame:
    """Create a dataframe from the peptides with resi, resn_TLA.

    Args:
        peptides: Peptides object

    Returns:
        DataFrame with columns:
            resi: residue number (int)
            resn: one letter amino acid code (str)
            resn_TLA: three letter amino acid code (str)


    """
    from Bio.Data import IUPACData
    import polars as pl

    one_to_three = IUPACData.protein_letters_1to3

    peptide_df = peptides.load().to_native()

    start, end = peptide_df["start"].min(), peptide_df["end"].max()
    residues = range(start, end + 1)
    known_sequence = "X" * len(residues)
    sequence = reconstruct_sequence(peptide_df, known_sequence, n_term=start)

    residue_df = (
        pl.DataFrame({"resi": residues, "resn": list(sequence)})
        .filter(pl.col("resn") != "X")
        .with_columns(
            [
                pl.col("resi"),
                pl.col("resn").replace_strict(one_to_three).str.to_uppercase().alias("resn_TLA"),
            ]
        )
    )

    return residue_df

residue_df_from_structure(structure, mapping=StructureMapping())

Create a dataframe from the structure with chain, resi, resn_TLA

Source code in hdxms_datasets/verification.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def residue_df_from_structure(
    structure: Structure, mapping: StructureMapping = StructureMapping()
) -> pl.DataFrame:
    """Create a dataframe from the structure with chain, resi, resn_TLA"""
    if structure.format not in ["cif", "mmcif"]:
        raise ValueError(f"Unsupported structure format: {structure.format}")

    # create the reference structure amino acid dataframe
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict
    from Bio.Data import IUPACData
    import polars as pl

    mm = MMCIF2Dict(structure.data_file)

    one_to_three = IUPACData.protein_letters_1to3
    AA_codes = [a.upper() for a in list(one_to_three.values())]

    # create a dataframe with chain, resi, resn_TLA from structure
    chain_name = "label_asym_id" if not mapping.auth_chain_labels else "auth_asym_id"
    resn_name = "label_seq_id" if not mapping.auth_residue_numbers else "auth_seq_id"

    structure_df = (
        pl.DataFrame(
            {
                "chain": mm["_atom_site." + chain_name],
                "resi": mm["_atom_site." + resn_name],
                "resn_TLA": [s.upper() for s in mm["_atom_site.label_comp_id"]],
            }
        )
        .unique()
        .filter(pl.col("resn_TLA").is_in(AA_codes))
    )

    return structure_df

summarize_compare_table(df)

Derive the metrics from the merged table.

Source code in hdxms_datasets/verification.py
224
225
226
227
228
229
230
231
232
233
234
235
236
def summarize_compare_table(df: pl.DataFrame) -> CompareSummary:
    """Derive the metrics from the merged table."""
    # rows with a structure match (adjust column names to your schema)
    import polars as pl

    matched = df.drop_nulls(subset=["resn_TLA_right"])
    identical = matched.filter(pl.col("resn_TLA") == pl.col("resn_TLA_right"))

    return {
        "total_residues": df.height,
        "matched_residues": matched.height,
        "identical_residues": identical.height,
    }

verify_dataset(dataset, strict=True)

Verify the integrity of the dataset by checking sequences and data files.

Source code in hdxms_datasets/verification.py
11
12
13
14
15
16
17
18
19
20
21
def verify_dataset(dataset: HDXDataSet, strict: bool = True):
    """Verify the integrity of the dataset by checking sequences and data files."""
    verify_peptides(dataset)
    if not datafiles_exist(dataset):
        raise ValueError("Missing datafiles")

    if dataset.file_hash is None:
        raise ValueError("Dataset file hash is not set")

    if strict:
        verify_version(dataset)

verify_peptides(dataset)

Verify that all peptide sequences match the protein sequence in the dataset states.

Source code in hdxms_datasets/verification.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def verify_peptides(dataset: HDXDataSet):
    """Verify that all peptide sequences match the protein sequence in the dataset states."""
    for state in dataset.states:
        sequence = state.protein_state.sequence
        for i, peptides in enumerate(state.peptides):
            try:
                peptide_table = peptides.load()
            except Exception as e:
                raise ValueError(f"State: {state.name}, Peptides[{i}] failed to load: {e}")
            try:
                mismatches = verify_sequence(
                    peptide_table, sequence, n_term=state.protein_state.n_term
                )
            except IndexError as e:
                raise IndexError(f"State: {state.name}, Peptides[{i}] has an index error: {e}")
            if mismatches:
                raise ValueError(
                    f"State: {state.name}, Peptides[{i}] does not match protein sequence, mismatches: {mismatches}"
                )

verify_version(dataset)

Verify that the dataset was created with a pep 440 compliant version.

Source code in hdxms_datasets/verification.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def verify_version(dataset: HDXDataSet):
    """Verify that the dataset was created with a pep 440 compliant version."""
    ver_str = dataset.metadata.package_version

    v = Version(ver_str)  # type: ignore

    if v.pre:
        raise ValueError(
            f"A pre-release version of `hdxms-datasets` was used to create this dataset: {ver_str}"
        )
    if v.dev:
        raise ValueError(
            f"A development version of `hdxms-datasets` was used to create this dataset: {ver_str}"
        )
    if v.local:
        raise ValueError(
            f"A local version of `hdxms-datasets` was used to create this dataset: {ver_str}"
        )