Skip to content

utils

contiguous_peptides(df, start='start', end='end')

Given a dataframe with 'start' and 'end' columns, where each row describes an inclusive interval, this function returns a list of (start, end) tuples representing the contiguous regions covered by those intervals.

Source code in hdxms_datasets/stable/v020/utils.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
@nw.narwhalify
def contiguous_peptides(df: IntoFrame, start="start", end="end") -> list[tuple[int, int]]:
    """
    Merge peptide intervals into contiguous regions.

    Each row of ``df`` describes an inclusive interval through its ``start``
    and ``end`` columns. Intervals that overlap, or that are directly adjacent
    (the next start is at most one past the current end), are merged.

    Args:
        df: Dataframe holding the interval columns.
        start: Column name for the start position.
        end: Column name for the end position.

    Returns:
        A list of (start, end) tuples, one per contiguous region, in ascending
        order. The list is empty when ``df`` has no rows.
    """
    # cast so the type checker treats df as a narwhals DataFrame
    frame = cast(nw.DataFrame, df)
    intervals = frame.select([start, end]).unique().sort(by=[start, end])

    merged = []
    region_lo, region_hi = None, 0

    for lo, hi in intervals.iter_rows(named=False):
        # A row extends the open region when it begins no later than one
        # position past the region's current end.
        extends_region = region_lo is not None and lo <= region_hi + 1
        if extends_region:
            region_hi = max(region_hi, hi)
            continue

        # Close the open region (if there is one) before starting the next.
        if region_lo is not None:
            merged.append((region_lo, region_hi))
        region_lo, region_hi = lo, hi

    # The final region is still open once the rows are exhausted.
    if region_lo is not None:
        merged.append((region_lo, region_hi))

    return merged

default_protein_info(peptides)

Generate minimal protein info from a set of peptides

Source code in hdxms_datasets/stable/v020/utils.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
@nw.narwhalify
def default_protein_info(peptides: IntoFrame) -> ProteinInfo:
    """
    Build a minimal protein info mapping from a set of peptides.

    The N- and C-terminal residue numbers are taken as the smallest "start"
    and largest "end" among the peptides. The sequence is a string of "X"
    placeholders spanning that range, passed through ``reconstruct_sequence``
    together with the peptides.

    Args:
        peptides: Dataframe with "start" and "end" columns, plus whatever
            columns ``reconstruct_sequence`` reads.

    Returns:
        A mapping with the keys "sequence", "n_term" and "c_term".
    """
    # Outermost residue numbers touched by any peptide
    first_residue = peptides["start"].min()  # type: ignore
    last_residue = peptides["end"].max()  # type: ignore

    # Both bounds are inclusive, hence the + 1 on the span length
    template = "X" * (last_residue - first_residue + 1)

    return {
        "sequence": reconstruct_sequence(peptides, template, n_term=first_residue),
        "n_term": int(first_residue),
        "c_term": int(last_residue),
    }

non_overlapping_peptides(df, start='start', end='end')

Given a dataframe with 'start' and 'end' columns, where each row describes an inclusive interval, this function returns a list of (start, end) tuples representing non-overlapping peptides.

Source code in hdxms_datasets/stable/v020/utils.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
@nw.narwhalify
def non_overlapping_peptides(
    df: IntoFrame,
    start: str = "start",
    end: str = "end",
) -> list[tuple[int, int]]:
    """
    Select a set of peptides that do not overlap one another.

    The unique (start, end) pairs are sorted by start, then end. The first
    pair is always kept; each later pair is kept only when its start lies
    strictly beyond the end of the most recently kept pair (intervals are
    inclusive, so sharing a single residue counts as overlap).

    Args:
        df: Dataframe holding the interval columns.
        start: Column name for the start position.
        end: Column name for the end position.

    Returns:
        A list of (start, end) tuples for the selected peptides, in ascending
        order. The list is empty when ``df`` has no rows.
    """
    df = cast(nw.DataFrame, df).select([start, end]).unique().sort(by=[start, end])

    out: list[tuple[int, int]] = []
    for start_val, end_val in df.rows():
        # `not out` covers the first row and keeps an empty frame from
        # failing on an out-of-range index.
        if not out or start_val > out[-1][1]:
            out.append((start_val, end_val))

    return out

peptide_redundancy(df, start='start', end='end')

Compute the redundancy of peptides in a DataFrame based on their start and end positions. Redundancy is defined as the number of peptides overlapping at each position.

Parameters:

Name Type Description Default
df IntoFrame

DataFrame containing peptide information with 'start' and 'end' columns.

required
start str

Column name for the start position.

'start'
end str

Column name for the end position.

'end'

Returns:

Type Description
tuple[ndarray, ndarray]

A tuple containing:

  • r_number: An array of positions from the minimum start to the maximum end.
  • redundancy: An array of redundancy counts for each position in r_number.
Source code in hdxms_datasets/stable/v020/utils.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
@nw.narwhalify
def peptide_redundancy(
    df: IntoFrame, start: str = "start", end: str = "end"
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute the redundancy of peptides in a DataFrame based on their start and end positions.
    Redundancy is defined as the number of peptides overlapping at each position.

    Duplicate (start, end) pairs are counted once. Intervals are treated as
    inclusive: a peptide contributes to every position from its start up to
    and including its end.

    Args:
        df: DataFrame containing peptide information with 'start' and 'end' columns.
        start: Column name for the start position.
        end: Column name for the end position.

    Returns:
        A tuple containing:
        - r_number: An array of positions from the minimum start to the maximum end.
        - redundancy: An array of redundancy counts for each position in r_number.

        Both arrays are empty when ``df`` has no rows.

    """
    df = cast(nw.DataFrame, df).select([start, end]).unique().sort(by=[start, end])
    rows = df.rows()
    if not rows:
        return np.array([], dtype=int), np.array([], dtype=int)

    # Rows are sorted by start, so the first row holds the minimum start.
    # The maximum end must be searched for: a long early peptide can end
    # after the peptide with the largest start.
    vmin = rows[0][0]
    vmax = max(e for _, e in rows)

    r_number = np.arange(vmin, vmax + 1, dtype=int)
    redundancy = np.zeros_like(r_number, dtype=int)
    for s, e in rows:
        i0 = np.searchsorted(r_number, s, side="left")
        # side="right" puts the slice stop one past `e`, so the inclusive
        # end residue is counted as covered.
        i1 = np.searchsorted(r_number, e, side="right")
        redundancy[i0:i1] += 1

    return r_number, redundancy

reconstruct_sequence(peptides, known_sequence, n_term=1)

Reconstruct the sequence from a dataframe of peptides with sequence information. The sequence is reconstructed by replacing the known sequence with the peptide sequences at the specified start and end positions.

Parameters:

Name Type Description Default
peptides DataFrame

DataFrame containing peptide information.

required
known_sequence str

Starting sequence. Can be a string 'X' as placeholder.

required
n_term int

The residue number of the N-terminal residue. This is typically 1, can be negative in case of purification tags.

1

Returns:

Type Description
str

The reconstructed sequence.

Source code in hdxms_datasets/stable/v020/utils.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
@nw.narwhalify
def reconstruct_sequence(peptides: nw.DataFrame, known_sequence: str, n_term: int = 1) -> str:
    """
    Reconstruct the sequence from a dataframe of peptides with sequence information.
    The sequence is reconstructed by replacing the known sequence with the peptide
    sequences at the specified start and end positions.

    Args:
        peptides: DataFrame containing peptide information; the "start", "end"
            and "sequence" columns are read.
        known_sequence: Starting sequence. Can be a string 'X' as placeholder.
        n_term: The residue number of the N-terminal residue. This is typically 1, can be
            negative in case of purification tags.

    Returns:
        The reconstructed sequence, with the same length as ``known_sequence``.

    Raises:
        AssertionError: If a peptide's sequence length does not match its
            inclusive start/end range.
        IndexError: If a peptide extends before ``n_term`` or past the end of
            ``known_sequence``.
    """

    reconstructed = list(known_sequence)
    total = len(reconstructed)
    for start, end, sequence in peptides.select(["start", "end", "sequence"]).iter_rows():  # type: ignore
        if end - start + 1 != len(sequence):
            # Raised explicitly rather than via `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            raise AssertionError(f"Length mismatch at {start}:{end} with sequence {sequence}")

        start_idx = start - n_term
        stop_idx = start_idx + len(sequence)
        # A negative index would silently wrap around and overwrite the tail
        # of the sequence, so reject out-of-range peptides up front.
        if start_idx < 0 or stop_idx > total:
            raise IndexError(
                f"Peptide {start}:{end} lies outside the known sequence "
                f"(residues {n_term}:{n_term + total - 1})"
            )

        # Lengths match and bounds are checked, so this overwrites in place
        # without resizing the list.
        reconstructed[start_idx:stop_idx] = sequence

    return "".join(reconstructed)

verify_sequence(peptides, known_sequence, n_term=1)

Verify the sequence of peptides against the given sequence.

Parameters:

Name Type Description Default
peptides IntoFrame

DataFrame containing peptide information.

required
known_sequence str

The original sequence to check against.

required
n_term int

The residue number of the N-terminal residue, i.e. the number assigned to the first character of the known sequence.

1

Returns:

Type Description
list[tuple[int, str, str]]

A list of mismatches, each a tuple of (residue number, expected residue, found residue).

Source code in hdxms_datasets/stable/v020/utils.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
@nw.narwhalify
def verify_sequence(
    peptides: IntoFrame, known_sequence: str, n_term: int = 1
) -> list[tuple[int, str, str]]:
    """
    Compare a known sequence with the sequence rebuilt from the peptides.

    The peptides are passed to ``reconstruct_sequence`` together with
    ``known_sequence``; the result is then compared with ``known_sequence``
    character by character.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: The original sequence to check against.
        n_term: Residue number assigned to the first character of
            ``known_sequence``; reported positions are counted from it.

    Returns:
        A list of (residue number, expected residue, found residue) tuples,
        one per position where the two sequences differ. The list is empty
        when they agree everywhere.
    """

    rebuilt = reconstruct_sequence(peptides, known_sequence, n_term)
    paired = zip(known_sequence, rebuilt)

    return [
        (position, expected, found)
        for position, (expected, found) in enumerate(paired, start=n_term)
        if expected != found
    ]