Skip to content

utils

contiguous_peptides(df)

Given a dataframe with 'start' and 'end' columns, where each row describes a range (inclusive interval), this function returns a list of (start, end) tuples representing the contiguous regions covered by those ranges.

Source code in hdxms_datasets/utils.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
@nw.narwhalify
def contiguous_peptides(df: IntoFrame) -> list[tuple[int, int]]:
    """
    Merge the inclusive 'start'/'end' intervals of a dataframe into contiguous regions.

    Duplicate intervals are dropped and the remainder is processed in order of
    ascending start (then end). An interval is merged into the running region
    when its start is at most one past the region's current end, i.e. when it
    overlaps the region or directly adjoins it.

    Args:
        df: DataFrame with 'start' and 'end' columns.

    Returns:
        A list of (start, end) tuples, one per contiguous region. The list is
        empty when the dataframe has no rows.
    """
    # cast only informs the type checker that we hold a narwhals DataFrame
    frame = cast(nw.DataFrame, df)
    intervals = frame.select(["start", "end"]).unique().sort(by=["start", "end"])
    rows = intervals.select([nw.col("start"), nw.col("end")]).iter_rows(named=False)

    merged = []
    region_start, region_end = None, 0
    for lo, hi in rows:
        if region_start is None:
            # Very first interval opens the first region
            region_start, region_end = lo, hi
            continue

        if lo <= region_end + 1:
            # Overlapping or directly adjacent: grow the open region
            region_end = max(region_end, hi)
            continue

        # Gap found: close the open region and open a new one
        merged.append((region_start, region_end))
        region_start, region_end = lo, hi

    # The region still open after the loop has not been emitted yet
    if region_start is not None:
        merged.append((region_start, region_end))

    return merged

get_peptides_by_type(peptides, deuteration_type)

Get peptides of a specific deuteration type.

Source code in hdxms_datasets/utils.py
187
188
189
190
191
192
193
194
195
196
def get_peptides_by_type(
    peptides: list[Peptides], deuteration_type: DeuterationType
) -> Optional[Peptides]:
    """Return the one entry of ``peptides`` whose deuteration type matches.

    Args:
        peptides: Candidate peptides objects; each must expose ``deuteration_type``.
        deuteration_type: The deuteration type to look for.

    Returns:
        The matching entry when exactly one entry matches. ``None`` when no
        entry matches, and also ``None`` when more than one entry matches.
    """
    candidates = [entry for entry in peptides if entry.deuteration_type == deuteration_type]
    # Only an unambiguous (single) hit is returned; zero or several hits yield None.
    return candidates[0] if len(candidates) == 1 else None

non_overlapping_peptides(df)

Given a dataframe with 'start' and 'end' columns, where each row describes a range (inclusive interval), this function returns a list of (start, end) tuples representing non-overlapping peptides.

Source code in hdxms_datasets/utils.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
@nw.narwhalify
def non_overlapping_peptides(
    df: IntoFrame,
) -> list[tuple[int, int]]:
    """
    Select a set of mutually non-overlapping peptides from a dataframe.

    The dataframe's 'start' and 'end' columns describe inclusive intervals.
    Duplicate intervals are dropped and the remainder is visited in order of
    ascending start (then end). The first interval is always kept; each
    following interval is kept only when its start lies strictly beyond the
    end of the most recently kept interval.

    Args:
        df: DataFrame with 'start' and 'end' columns.

    Returns:
        A list of (start, end) tuples for the kept peptides. The list is empty
        when the dataframe has no rows.
    """
    df = cast(nw.DataFrame, df).select(["start", "end"]).unique().sort(by=["start", "end"])

    regions = df.rows()
    # Guard against an empty frame: indexing regions[0] would raise IndexError.
    if not regions:
        return []

    out = [regions[0]]
    for start_val, end_val in regions[1:]:
        # Intervals are inclusive, so sharing the end residue counts as overlap.
        if start_val > out[-1][1]:
            out.append((start_val, end_val))

    return out

peptide_redundancy(df)

Compute the redundancy of peptides in a DataFrame based on their start and end positions. Redundancy is defined as the number of peptides overlapping at each position.

Parameters:

Name Type Description Default
df IntoFrame

DataFrame containing peptide information with 'start' and 'end' columns.

required
start

Column name for the start position.

required
end

Column name for the end position.

required

Returns:

Type Description
tuple[ndarray, ndarray]

A tuple containing:

  • r_number: An array of positions from the minimum start to the maximum end.
  • redundancy: An array of redundancy counts for each position in r_number.

Source code in hdxms_datasets/utils.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
@nw.narwhalify
def peptide_redundancy(df: IntoFrame) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute the redundancy of peptides in a DataFrame based on their start and end positions.
    Redundancy is defined as the number of peptides overlapping at each position.

    Duplicate (start, end) pairs are counted once. Intervals are treated as
    inclusive on both sides, so a peptide contributes to every position from
    its start up to and including its end.

    Args:
        df: DataFrame containing peptide information with 'start' and 'end' columns.

    Returns:
        A tuple containing:
        - r_number: An array of positions from the minimum start to the maximum end.
        - redundancy: An array of redundancy counts for each position in r_number.

        Both arrays are empty when the dataframe has no rows.

    """
    df = cast(nw.DataFrame, df).select(["start", "end"]).unique().sort(by=["start", "end"])

    # Nothing to count; indexing into an empty column would raise.
    if len(df) == 0:
        return np.array([], dtype=int), np.array([], dtype=int)

    # The frame is sorted by start, so the last row does not necessarily hold
    # the largest end (a long early peptide can contain a later short one).
    # Take the true extremes of each column instead.
    vmin, vmax = df["start"].min(), df["end"].max()

    r_number = np.arange(vmin, vmax + 1, dtype=int)
    redundancy = np.zeros_like(r_number, dtype=int)
    for s, e in df.rows():
        i0, i1 = np.searchsorted(r_number, (s, e))
        # i1 is the index of the end residue itself; +1 keeps the interval inclusive.
        redundancy[i0 : i1 + 1] += 1

    return r_number, redundancy

peptides_are_unique(peptides_df)

Check if the peptides in the dataframe are unique.

Source code in hdxms_datasets/utils.py
199
200
201
202
def peptides_are_unique(peptides_df: nw.DataFrame) -> bool:
    """Report whether every row has a distinct ('start', 'end') pair.

    Args:
        peptides_df: DataFrame with 'start' and 'end' columns.

    Returns:
        True when dropping duplicate ('start', 'end') pairs leaves the row
        count unchanged, False otherwise.
    """
    distinct_intervals = peptides_df.select(["start", "end"]).unique()
    n_distinct = len(distinct_intervals)
    n_rows = len(peptides_df)
    return n_distinct == n_rows

reconstruct_sequence(peptides, known_sequence, n_term=1, start='start', end='end', sequence='sequence')

Reconstruct the sequence from a dataframe of peptides with sequence information. The sequence is reconstructed by replacing the known sequence with the peptide sequences at the specified start and end positions.

Parameters:

Name Type Description Default
peptides DataFrame

DataFrame containing peptide information.

required
known_sequence str

Starting sequence. Can be a string 'X' as placeholder.

required
n_term int

The residue number of the N-terminal residue. This is typically 1, can be negative in case of purification tags.

1
start

Column name for the start position of the peptide.

'start'
end

Column name for the end position of the peptide.

'end'
sequence

Column name for the peptide sequence.

'sequence'

Returns:

Type Description
str

The reconstructed sequence.

Source code in hdxms_datasets/utils.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
@nw.narwhalify
def reconstruct_sequence(
    peptides: nw.DataFrame,
    known_sequence: str,
    n_term: int = 1,
    start="start",
    end="end",
    sequence="sequence",
) -> str:
    """
    Reconstruct the sequence from a dataframe of peptides with sequence information.
    The sequence is reconstructed by replacing the known sequence with the peptide
    sequences at the specified start and end positions.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: Starting sequence. Can be a string 'X' as placeholder.
        n_term: The residue number of the N-terminal residue. This is typically 1, can be
            negative in case of purification tags.
        start: Column name for the start position of the peptide.
        end: Column name for the end position of the peptide.
        sequence: Column name for the peptide sequence.

    Returns:
        The reconstructed sequence.

    Raises:
        AssertionError: If a peptide's sequence length does not equal
            ``end - start + 1`` (inclusive interval).
        IndexError: If a peptide does not fit inside ``known_sequence`` given
            ``n_term``, i.e. it starts before the N-terminal residue or ends
            past the last residue.
    """

    reconstructed = list(known_sequence)
    n_residues = len(reconstructed)
    for start_, end_, sequence_ in peptides.select([start, end, sequence]).iter_rows():  # type: ignore
        assert end_ - start_ + 1 == len(sequence_), (
            f"Length mismatch at {start_}:{end_} with sequence {sequence_}"
        )

        # Convert the residue number to a zero-based offset into the sequence.
        start_idx = start_ - n_term
        stop_idx = start_idx + len(sequence_)

        # A negative offset would otherwise wrap around and silently overwrite
        # residues at the end of the sequence, so reject it explicitly.
        if start_idx < 0 or stop_idx > n_residues:
            raise IndexError(
                f"Peptide {start_}:{end_} lies outside the known sequence "
                f"(n_term={n_term}, length={n_residues})"
            )

        # Lengths match (checked above), so this overwrites in place without resizing.
        reconstructed[start_idx:stop_idx] = sequence_

    return "".join(reconstructed)

records_to_dict(records)

Convert a list of records to a dictionary.

Source code in hdxms_datasets/utils.py
17
18
19
20
21
22
23
24
25
26
def records_to_dict(records: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Transpose a list of record dicts into a single dict of lists.

    Each key seen in any record maps to the list of values found under that
    key, in the order the records are given. Records lacking a key simply
    contribute nothing to that key's list.
    """
    columns: dict[str, Any] = {}
    for row in records:
        for field, item in row.items():
            # First sighting of a key creates its list; later ones extend it.
            columns.setdefault(field, []).append(item)

    return columns

verify_sequence(peptides, known_sequence, n_term=1, start='start', end='end', sequence='sequence')

Verify the sequence of peptides against the given sequence.

Parameters:

Name Type Description Default
peptides IntoFrame

DataFrame containing peptide information.

required
known_sequence str

The original sequence to check against.

required
n_term int

The residue number of the N-terminal residue.

1

Returns:

Type Description
list[tuple[int, str, str]]

A list of mismatches, each a tuple of (residue number, expected residue, found residue).

Source code in hdxms_datasets/utils.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
@nw.narwhalify
def verify_sequence(
    peptides: IntoFrame,
    known_sequence: str,
    n_term: int = 1,
    start="start",
    end="end",
    sequence="sequence",
) -> list[tuple[int, str, str]]:
    """
    Compare a known sequence against the sequence rebuilt from peptides.

    The peptides are laid over ``known_sequence`` with ``reconstruct_sequence``
    and the result is compared to ``known_sequence`` position by position.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: The original sequence to check against.
        n_term: The residue number assigned to the first character of
            ``known_sequence``; reported residue numbers count up from here.
        start: Column name for the start position of the peptide.
        end: Column name for the end position of the peptide.
        sequence: Column name for the peptide sequence.

    Returns:
        A list of (residue number, expected, found) tuples, one for every
        position where the rebuilt sequence differs from ``known_sequence``.
        The list is empty when the two agree everywhere.
    """

    rebuilt = reconstruct_sequence(
        peptides, known_sequence, n_term, start=start, end=end, sequence=sequence
    )

    # Walk both sequences in lockstep, numbering residues from n_term.
    numbered_pairs = enumerate(zip(known_sequence, rebuilt), start=n_term)
    return [
        (residue_number, reference, observed)
        for residue_number, (reference, observed) in numbered_pairs
        if reference != observed
    ]