Skip to content

verification

build_structure_peptides_comparison(structure, peptides)

Compares residue numbering and identity between a structure and peptides.

dictionary with:

Type Description
DataFrame
  • total_residues: Total number of residues in the peptides (considering chains)
DataFrame
  • matched_residues: Number of residues that are matched to the structure (by chain and resi)
DataFrame
  • identical_residues: Number of matched residues that have the correct amino acid identity
Source code in hdxms_datasets/verification.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def build_structure_peptides_comparison(
    structure: Structure,
    peptides: Peptides,
) -> pl.DataFrame:
    """
    Compares residue numbering and identity between a structure and peptides.

    Returns:  dictionary with:
        - total_residues: Total number of residues in the peptides (considering chains)
        - matched_residues: Number of residues that are matched to the structure (by chain and resi)
        - identical_residues: Number of matched residues that have the correct amino acid identity

    """
    structure_df = residue_df_from_structure(structure)
    residue_df = residue_df_from_peptides(peptides)

    chains = (
        peptides.chain if peptides.chain is not None else structure_df["chain"].unique().to_list()
    )

    # supplement the residue_df with all chains
    # multie-chain peptides are expected to correspond to homomultimers
    import polars as pl

    residue_df_chain = pl.concat(
        [residue_df.with_columns(pl.lit(ch).alias("chain")) for ch in chains]
    )

    merged = residue_df_chain.join(
        structure_df,
        on=["chain", "resi"],
        how="left",
    )

    return merged

compare_structure_peptides(structure, peptides, returns='dict')

compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["dict"] = "dict",
) -> CompareSummary
compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["df"],
) -> pl.DataFrame
compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["both"],
) -> tuple[CompareSummary, pl.DataFrame]

Compares structure and peptide data.

Source code in hdxms_datasets/verification.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def compare_structure_peptides(
    structure: Structure,
    peptides: Peptides,
    returns: Literal["dict", "df", "both"] = "dict",
) -> pl.DataFrame | CompareSummary | tuple[CompareSummary, pl.DataFrame]:
    """Compares structure and peptide data."""

    df = build_structure_peptides_comparison(structure, peptides)

    if returns == "dict":
        return summarize_compare_table(df)
    elif returns == "df":
        return df
    elif returns == "both":
        return summarize_compare_table(df), df
    else:
        raise ValueError(f"Invalid returns value: {returns!r}")

datafiles_exist(dataset)

Check if the data files for all peptides and structures in the dataset exist.

Source code in hdxms_datasets/verification.py
38
39
40
41
42
43
44
45
46
47
48
def datafiles_exist(dataset: HDXDataSet) -> bool:
    """
    Check if the data files for all peptides and structures in the dataset exist.
    """
    for state in dataset.states:
        for peptides in state.peptides:
            if not peptides.data_file.exists():
                return False
    if not dataset.structure.data_file.exists():
        return False
    return True

residue_df_from_peptides(peptides)

Create a dataframe from the peptides with resi, resn_TLA

Source code in hdxms_datasets/verification.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def residue_df_from_peptides(peptides: Peptides) -> pl.DataFrame:
    """Create a dataframe from the peptides with resi, resn_TLA"""
    from Bio.Data import IUPACData
    import polars as pl

    one_to_three = IUPACData.protein_letters_1to3

    peptide_df = peptides.load().to_native()

    start, end = peptide_df["start"].min(), peptide_df["end"].max()
    residues = range(start, end + 1)
    known_sequence = "X" * len(residues)
    sequence = reconstruct_sequence(peptide_df, known_sequence, n_term=start)

    residue_df = (
        pl.DataFrame({"resi": residues, "resn": list(sequence)})
        .filter(pl.col("resn") != "X")
        .with_columns(
            [
                pl.col("resi").cast(str),
                pl.col("resn").replace_strict(one_to_three).str.to_uppercase().alias("resn_TLA"),
            ]
        )
    )

    return residue_df

residue_df_from_structure(structure)

Create a dataframe from the structure with chain, resi, resn_TLA

Source code in hdxms_datasets/verification.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def residue_df_from_structure(structure: Structure) -> pl.DataFrame:
    """Create a dataframe from the structure with chain, resi, resn_TLA"""
    if structure.format not in ["cif", "mmcif"]:
        raise ValueError(f"Unsupported structure format: {structure.format}")

    # create the reference structure amino acid dataframe
    from Bio.PDB.MMCIF2Dict import MMCIF2Dict
    from Bio.Data import IUPACData
    import polars as pl

    mm = MMCIF2Dict(structure.data_file)

    one_to_three = IUPACData.protein_letters_1to3
    AA_codes = [a.upper() for a in list(one_to_three.values())]

    # create a dataframe with chain, resi, resn_TLA from structure
    chain_name = "label_asym_id" if not structure.auth_chain_labels else "auth_asym_id"
    resn_name = "label_seq_id" if not structure.auth_residue_numbers else "auth_seq_id"

    structure_df = (
        pl.DataFrame(
            {
                "chain": mm["_atom_site." + chain_name],
                "resi": mm["_atom_site." + resn_name],
                "resn_TLA": [s.upper() for s in mm["_atom_site.label_comp_id"]],
            }
        )
        .unique()
        .filter(pl.col("resn_TLA").is_in(AA_codes))
    )

    return structure_df

summarize_compare_table(df)

Derive the metrics from the merged table.

Source code in hdxms_datasets/verification.py
156
157
158
159
160
161
162
163
164
165
166
167
168
def summarize_compare_table(df: pl.DataFrame) -> CompareSummary:
    """Derive the metrics from the merged table."""
    # rows with a structure match (adjust column names to your schema)
    import polars as pl

    matched = df.drop_nulls(subset=["resn_TLA_right"])
    identical = matched.filter(pl.col("resn_TLA") == pl.col("resn_TLA_right"))

    return {
        "total_residues": df.height,
        "matched_residues": matched.height,
        "identical_residues": identical.height,
    }

verify_dataset(dataset)

Verify the integrity of the dataset by checking sequences and data files.

Source code in hdxms_datasets/verification.py
10
11
12
13
14
15
16
17
def verify_dataset(dataset: HDXDataSet):
    """Verify the integrity of the dataset by checking sequences and data files."""
    verify_peptides(dataset)
    if not datafiles_exist(dataset):
        raise ValueError("Missing datafiles")

    if dataset.file_hash is None:
        raise ValueError("Dataset file hash is not set")

verify_peptides(dataset)

Verify that all peptide sequences match the protein sequence in the dataset states.

Source code in hdxms_datasets/verification.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def verify_peptides(dataset: HDXDataSet):
    """Verify that all peptide sequences match the protein sequence in the dataset states."""
    for state in dataset.states:
        sequence = state.protein_state.sequence
        for i, peptides in enumerate(state.peptides):
            peptide_table = peptides.load()
            try:
                mismatches = verify_sequence(
                    peptide_table, sequence, n_term=state.protein_state.n_term
                )
            except IndexError as e:
                raise IndexError(f"State: {state.name}, Peptides[{i}] has an index error: {e}")
            if mismatches:
                raise ValueError(
                    f"State: {state.name}, Peptides[{i}] does not match protein sequence, mismatches: {mismatches}"
                )