Skip to content

utils

contiguous_peptides(df)

Given a dataframe with 'start' and 'end' columns, where each row describes a range (inclusive interval), this function returns a list of (start, end) tuples representing contiguous regions.

Source code in hdxms_datasets/utils.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
@nw.narwhalify
def contiguous_peptides(df: IntoFrame) -> list[tuple[int, int]]:
    """
    Merge peptide intervals into contiguous regions.

    The dataframe must have 'start' and 'end' columns, each row describing an
    inclusive interval. Intervals that overlap, or where the next start is at
    most one past the running end, are merged into a single region.

    Returns:
        A list of (start, end) tuples, one per contiguous region, ordered by start.
    """
    # cast so type checkers treat df as a narwhals DataFrame
    intervals = cast(nw.DataFrame, df).select(["start", "end"]).unique().sort(by=["start", "end"])

    merged = []
    region_start, region_end = None, 0

    for lo, hi in intervals.iter_rows(named=False):
        # A gap larger than one position closes the region that is currently open
        if region_start is not None and lo > region_end + 1:
            merged.append((region_start, region_end))
            region_start = None

        if region_start is None:
            # Open a new region (first row, or right after a gap)
            region_start, region_end = lo, hi
        else:
            # Still contiguous: push the right edge outwards if needed
            region_end = max(region_end, hi)

    # Flush the region that is still open once all rows are consumed
    if region_start is not None:
        merged.append((region_start, region_end))

    return merged

diff_sequence(a, b)

Compute the similarity ratio between two sequences.

Source code in hdxms_datasets/utils.py
13
14
15
16
17
def diff_sequence(a: str, b: str) -> float:
    """
    Return the difflib similarity ratio of two sequences.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

get_peptides_by_type(peptides, deuteration_type)

Get peptides of a specific deuteration type.

Source code in hdxms_datasets/utils.py
177
178
179
180
181
182
183
184
185
186
def get_peptides_by_type(
    peptides: list[Peptides], deuteration_type: DeuterationType
) -> Optional[Peptides]:
    """Return the single entry of `peptides` with the given deuteration type.

    Returns None when no entry matches, and also when more than one entry matches.
    """
    candidates = [item for item in peptides if item.deuteration_type == deuteration_type]
    # Only an unambiguous (exactly one) match is returned
    return candidates[0] if len(candidates) == 1 else None

non_overlapping_peptides(df)

Given a dataframe with 'start' and 'end' columns, where each row describes a range (inclusive interval), this function returns a list of (start, end) tuples representing non-overlapping peptides.

Source code in hdxms_datasets/utils.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
@nw.narwhalify
def non_overlapping_peptides(
    df: IntoFrame,
) -> list[tuple[int, int]]:
    """
    Select a set of mutually non-overlapping peptides.

    The dataframe must have 'start' and 'end' columns, each row describing an
    inclusive interval. Unique intervals are visited in (start, end) order; the
    first one is always kept, and each following interval is kept only if it
    starts strictly after the end of the most recently kept interval.

    Args:
        df: DataFrame with 'start' and 'end' columns.

    Returns:
        A list of (start, end) tuples of the selected peptides. The list is
        empty when the dataframe has no rows.
    """
    df = cast(nw.DataFrame, df).select(["start", "end"]).unique().sort(by=["start", "end"])

    out: list[tuple[int, int]] = []
    for start_val, end_val in df.rows():
        # `not out` accepts the first interval and makes empty input yield [].
        # Ends are inclusive, so a peptide starting exactly at the previous end overlaps.
        if not out or start_val > out[-1][1]:
            out.append((start_val, end_val))

    return out

peptide_redundancy(df)

Compute the redundancy of peptides in a DataFrame based on their start and end positions. Redundancy is defined as the number of peptides overlapping at each position.

Parameters:

Name Type Description Default
df IntoFrame

DataFrame containing peptide information with 'start' and 'end' columns.

required

Returns:

Type Description
tuple[ndarray, ndarray]

A tuple containing:

  • r_number: An array of positions from the minimum start to the maximum end.
  • redundancy: An array of redundancy counts for each position in r_number.
Source code in hdxms_datasets/utils.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
@nw.narwhalify
def peptide_redundancy(df: IntoFrame) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute the redundancy of peptides in a DataFrame based on their start and end positions.
    Redundancy is defined as the number of unique peptides covering each position, where
    both the start and the end position of a peptide are counted (inclusive intervals).

    Args:
        df: DataFrame containing peptide information with 'start' and 'end' columns.

    Returns:
        A tuple containing:
        - r_number: An array of positions from the minimum start to the maximum end.
        - redundancy: An array of redundancy counts for each position in r_number.

        Both arrays are empty when the dataframe has no rows.

    """
    rows = cast(nw.DataFrame, df).select(["start", "end"]).unique().rows()
    if not rows:
        # Nothing to count; avoid indexing into an empty frame
        return np.array([], dtype=int), np.array([], dtype=int)

    # The largest end is not necessarily on the row with the largest start
    # (e.g. (1, 20) and (5, 10)), so take min/max over all rows explicitly.
    vmin = min(s for s, _ in rows)
    vmax = max(e for _, e in rows)

    r_number = np.arange(vmin, vmax + 1, dtype=int)
    redundancy = np.zeros_like(r_number, dtype=int)
    for s, e in rows:
        i0, i1 = np.searchsorted(r_number, (s, e))
        # `end` is inclusive, so the slice must extend one past its index
        redundancy[i0 : i1 + 1] += 1

    return r_number, redundancy

peptides_are_unique(peptides_df)

Check if the peptides in the dataframe are unique.

Source code in hdxms_datasets/utils.py
189
190
191
192
def peptides_are_unique(peptides_df: nw.DataFrame) -> bool:
    """Return True when no (start, end) pair occurs more than once in the dataframe."""
    n_rows = len(peptides_df)
    n_distinct = len(peptides_df.select(["start", "end"]).unique())
    # Dropping duplicate (start, end) pairs must not shrink the frame
    return n_distinct == n_rows

reconstruct_sequence(peptides, known_sequence, n_term=1)

Reconstruct the sequence from a dataframe of peptides with sequence information. The sequence is reconstructed by replacing the known sequence with the peptide sequences at the specified start and end positions.

Parameters:

Name Type Description Default
peptides DataFrame

DataFrame containing peptide information.

required
known_sequence str

Starting sequence. Can be a string 'X' as placeholder.

required
n_term int

The residue number of the N-terminal residue. This is typically 1, can be negative in case of purification tags.

1

Returns:

Type Description
str

The reconstructed sequence.

Source code in hdxms_datasets/utils.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
@nw.narwhalify
def reconstruct_sequence(
    peptides: nw.DataFrame,
    known_sequence: str,
    n_term: int = 1,
) -> str:
    """
    Reconstruct the sequence form a dataframe of peptides with sequence information.
    The sequence is reconstructed by replacing the known sequence with the peptide
    sequences at the specified start and end positions.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: Starting sequence. Can be a string 'X' as placeholder.
        n_term: The residue number of the N-terminal residue. This is typically 1, can be
            negative in case of purification tags.

    Returns:
        The reconstructed sequence.
    """

    reconstructed = list(known_sequence)
    for start_, end_, sequence_ in peptides.select(["start", "end", "sequence"]).iter_rows():  # type: ignore
        start_idx = start_ - n_term
        assert end_ - start_ + 1 == len(sequence_), (
            f"Length mismatch at {start_}:{end_} with sequence {sequence_}"
        )

        for i, aa in enumerate(sequence_, start=start_idx):
            reconstructed[i] = aa

    return "".join(reconstructed)

records_to_dict(records)

Convert a list of records to a dictionary.

Source code in hdxms_datasets/utils.py
20
21
22
23
24
25
26
27
28
29
def records_to_dict(records: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Turn a list of record dicts into a dict of lists.

    Every key seen in any record maps to the list of values found under that
    key, in record order. Records lacking a key contribute nothing to its list.
    """
    columns: dict[str, Any] = {}
    for row in records:
        for field, item in row.items():
            # Create the list on first sight of a key, then append
            columns.setdefault(field, []).append(item)

    return columns

verify_sequence(peptides, known_sequence, n_term=1)

Verify the sequence of peptides against the given sequence.

Parameters:

Name Type Description Default
peptides IntoFrame

DataFrame containing peptide information.

required
known_sequence str

The original sequence to check against.

required
n_term int

The residue number of the N-terminal residue (the position of the first character of the known sequence).

1

Returns:

Type Description
list[tuple[int, str, str]]

A list of mismatches, each a tuple of (residue number, expected residue, found residue).

Source code in hdxms_datasets/utils.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
@nw.narwhalify
def verify_sequence(
    peptides: IntoFrame,
    known_sequence: str,
    n_term: int = 1,
) -> list[tuple[int, str, str]]:
    """
    Verify the sequence of peptides against the given sequence.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: The original sequence to check against.
        n_term: The number of N-terminal residues to consider.

    Returns:
        A tuple containing the fixed sequence and a list of mismatches.
    """

    reconstructed_sequence = reconstruct_sequence(peptides, known_sequence, n_term)

    mismatches = []
    for r_number, (expected, found) in enumerate(
        zip(known_sequence, reconstructed_sequence), start=n_term
    ):
        if expected != found:
            mismatches.append((r_number, expected, found))

    return mismatches