Skip to content

utils

check_sequence(peptides, known_sequence, n_term=1)

Check the sequence of peptides against the given sequence.

Parameters:

Name Type Description Default
peptides DataFrame

DataFrame containing peptide information.

required
known_sequence str

The original sequence to check against.

required
n_term int

The residue number of the N-terminal (first) residue of the known sequence.

1

Returns:

Type Description
list[tuple[int, str, str]]

A list of mismatches, each a tuple of (residue number, expected residue, found residue).

Source code in hdxms_datasets/utils.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
def check_sequence(
    peptides: nw.DataFrame, known_sequence: str, n_term: int = 1
) -> list[tuple[int, str, str]]:
    """
    Check the sequence of peptides against the given sequence.

    The peptides are overlaid on `known_sequence` with `reconstruct_sequence` and
    the result is compared to `known_sequence` residue by residue. Positions not
    covered by any peptide keep the known residue and therefore never mismatch.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: The original sequence to check against.
        n_term: The residue number of the first residue of `known_sequence`.
            Mismatch positions are reported in this numbering.

    Returns:
        A list of `(residue_number, expected, found)` tuples, one per position where
        the residue in `known_sequence` (expected) differs from the residue given by
        the peptides (found). The list is empty when everything agrees.
    """

    reconstructed_sequence = reconstruct_sequence(peptides, known_sequence, n_term)

    # Number residues from n_term so reported positions match the peptide table.
    numbered_pairs = enumerate(zip(known_sequence, reconstructed_sequence), start=n_term)
    return [
        (r_number, expected, found)
        for r_number, (expected, found) in numbered_pairs
        if expected != found
    ]

default_protein_info(peptides)

Generate minimal protein info from a set of peptides

Source code in hdxms_datasets/utils.py
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def default_protein_info(peptides: nw.DataFrame) -> ProteinInfo:
    """Generate minimal protein info from a set of peptides.

    The protein is taken to span from the smallest "start" value to the largest
    "end" value found in `peptides`. Its sequence is filled in from the peptide
    sequences; residues not covered by any peptide are left as "X".
    """
    first_residue = peptides["start"].min()
    last_residue = peptides["end"].max()

    # All-"X" template covering first_residue..last_residue inclusive.
    template = "X" * (last_residue - first_residue + 1)

    return {
        # Numbering starts at the first observed residue, so it maps to index 0.
        "sequence": reconstruct_sequence(peptides, template, n_term=first_residue),
        "n_term": int(first_residue),
        "c_term": int(last_residue),
    }

reconstruct_sequence(peptides, known_sequence, n_term=1)

Reconstruct the sequence from a dataframe of peptides with sequence information. The sequence is reconstructed by replacing the known sequence with the peptide sequences at the specified start and end positions.

Parameters:

Name Type Description Default
peptides DataFrame

DataFrame containing peptide information.

required
known_sequence str

Starting sequence. Can be a string 'X' as placeholder.

required
n_term int

The residue number of the N-terminal residue. This is typically 1, can be negative in case of purification tags.

1

Returns:

Type Description
str

The reconstructed sequence.

Source code in hdxms_datasets/utils.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def reconstruct_sequence(peptides: nw.DataFrame, known_sequence: str, n_term: int = 1) -> str:
    """
    Reconstruct the sequence from a dataframe of peptides with sequence information.
    The sequence is reconstructed by replacing the known sequence with the peptide
    sequences at the specified start and end positions.

    Residues not covered by any peptide keep their value from `known_sequence`.
    Where peptides overlap, rows later in the dataframe overwrite earlier ones.

    Args:
        peptides: DataFrame containing peptide information. The "start", "end" and
            "sequence" columns are read.
        known_sequence: Starting sequence. Can be a string 'X' as placeholder.
        n_term: The residue number of the N-terminal residue. This is typically 1, can be
            negative in case of purification tags.

    Returns:
        The reconstructed sequence, with the same length as `known_sequence`.

    Raises:
        AssertionError: If a peptide's sequence length differs from `end - start + 1`.
        IndexError: If a peptide extends outside the residue range covered by
            `known_sequence` (before `n_term` or past the last residue).
    """

    reconstructed = list(known_sequence)
    total = len(reconstructed)
    for start, end, sequence in peptides.select(["start", "end", "sequence"]).iter_rows():  # type: ignore
        # Raised explicitly rather than via `assert` so the check is not stripped
        # under `python -O`; exception type and message are unchanged.
        if end - start + 1 != len(sequence):
            raise AssertionError(f"Length mismatch at {start}:{end} with sequence {sequence}")

        start_idx = start - n_term
        stop_idx = start_idx + len(sequence)

        # A negative index would silently wrap around and overwrite residues at the
        # C-terminal end, so reject out-of-range peptides on both sides.
        if start_idx < 0 or stop_idx > total:
            raise IndexError(
                f"Peptide {start}:{end} lies outside the known sequence range "
                f"{n_term}:{n_term + total - 1}"
            )

        # Lengths were validated above, so this replaces residues one-for-one.
        reconstructed[start_idx:stop_idx] = sequence

    return "".join(reconstructed)