Skip to content

utils

FrameSlicer

Bases: Generic[IntoFrameT]

Wrap a DataFrame and allow indexing by column values (sorted).

Example

s = FrameSlicer(df, col="exposure")
first_df = s[0] # filtered dataframe where col == first unique value
three = s[0:3] # filtered dataframe where col in first three unique values

Source code in hdxms_datasets/utils.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
class FrameSlicer(Generic[IntoFrameT]):
    """Index a DataFrame by the sorted unique values of one of its columns.

    An integer key selects the rows whose column equals a single unique value;
    a slice key selects the rows whose column is any of the unique values in
    that slice. Results are returned as native (non-narwhals) frames.

    Example:
        s = FrameSlicer(df, col="exposure")
        first_df = s[0] # filtered dataframe where col == first unique value
        three = s[0:3]  # filtered dataframe where col in first three unique values
    """

    def __init__(self, df: IntoFrameT, col: str):
        self._df = nw.from_native(df)
        self._col = col

        # Distinct column values as a plain Python list, in ascending order;
        # positions in this list are what integer/slice keys refer to.
        distinct = self._df[self._col].unique().to_list()
        self._values: list[object] = sorted(distinct)

    def __len__(self) -> int:
        # Number of distinct values, i.e. number of addressable sub-frames.
        return len(self._values)

    def __iter__(self):
        # Yield one filtered frame per unique value, in sorted order.
        yield from map(self.__getitem__, range(len(self)))

    def __getitem__(self, key: Union[int, slice]) -> IntoFrameT:
        selected = self._values[key]
        column = nw.col(self._col)

        # A slice key yields a list of values (membership test); an integer
        # key yields a single value (equality test).
        mask = column.is_in(selected) if isinstance(selected, list) else column == selected
        return self._df.filter(mask).to_native()

contiguous_peptides(df)

Given a dataframe with 'start' and 'end' columns, each describing a range,
(inclusive intervals), this function returns a list of tuples
representing contiguous regions.

Source code in hdxms_datasets/utils.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
@nw.narwhalify
def contiguous_peptides(df: IntoFrame) -> list[tuple[int, int]]:
    """
    Merge the ('start', 'end') ranges of a dataframe into contiguous regions.

    Ranges are treated as inclusive intervals. Two ranges belong to the same
    region when the next one starts no later than one position past the end
    of the region built so far (i.e. overlapping or directly adjacent).

    Args:
        df: DataFrame with 'start' and 'end' columns.

    Returns:
        A list of (start, end) tuples, one per contiguous region, in
        ascending order. Empty when the dataframe has no rows.
    """
    # cast so type checkers treat df as a narwhals DataFrame
    intervals = cast(nw.DataFrame, df).select(["start", "end"]).unique().sort(by=["start", "end"])

    merged = []
    open_start, open_end = None, 0  # region currently being grown; None = not started

    for lo, hi in intervals.iter_rows(named=False):
        if open_start is None:
            # First interval opens the first region
            open_start, open_end = lo, hi
            continue

        if lo <= open_end + 1:
            # Overlapping or adjacent: grow the open region
            open_end = max(open_end, hi)
            continue

        # Gap found: close the open region and open a new one
        merged.append((open_start, open_end))
        open_start, open_end = lo, hi

    # Close the region that was still open when the rows ran out
    if open_start is not None:
        merged.append((open_start, open_end))

    return merged

diff_sequence(a, b)

Compute the similarity ratio between two sequences.

Source code in hdxms_datasets/utils.py
13
14
15
16
17
def diff_sequence(a: str, b: str) -> float:
    """
    Return the similarity ratio of two sequences as computed by difflib.

    Args:
        a: First sequence.
        b: Second sequence.

    Returns:
        The ``difflib.SequenceMatcher`` ratio of ``a`` and ``b`` (1.0 for
        identical inputs, 0.0 when nothing matches).
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

get_peptides_by_type(peptides, deuteration_type)

Get peptides of a specific deuteration type.

Source code in hdxms_datasets/utils.py
177
178
179
180
181
182
183
184
185
186
def get_peptides_by_type(
    peptides: list[Peptides], deuteration_type: DeuterationType
) -> Optional[Peptides]:
    """Return the single entry of ``peptides`` with the given deuteration type.

    Args:
        peptides: Candidates to search; each is matched on its
            ``deuteration_type`` attribute.
        deuteration_type: The deuteration type to look for.

    Returns:
        The matching entry when exactly one candidate matches. ``None`` when
        nothing matches, and also when more than one candidate matches.
    """
    hits = [p for p in peptides if p.deuteration_type == deuteration_type]
    # Only an unambiguous (exactly one) match is returned.
    return hits[0] if len(hits) == 1 else None

non_overlapping_peptides(df)

Given a dataframe with 'start' and 'end' columns, each describing a range,
(inclusive intervals), this function returns a list of tuples
representing non-overlapping peptides.

Source code in hdxms_datasets/utils.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
@nw.narwhalify
def non_overlapping_peptides(
    df: IntoFrame,
) -> list[tuple[int, int]]:
    """
    Greedily select a set of non-overlapping peptides from a dataframe.

    The unique ('start', 'end') ranges (inclusive intervals) are sorted by
    start, then end. The first range is always kept; every following range is
    kept only if it starts strictly after the end of the last kept range.

    Args:
        df: DataFrame with 'start' and 'end' columns.

    Returns:
        A list of (start, end) tuples for the selected peptides, in ascending
        order. Empty when the dataframe has no rows.
    """
    df = cast(nw.DataFrame, df).select(["start", "end"]).unique().sort(by=["start", "end"])

    out: list[tuple[int, int]] = []
    for start_val, end_val in df.rows():
        # `not out` keeps the first range and makes an empty frame yield []
        # instead of failing on an out-of-range index.
        if not out or start_val > out[-1][1]:
            out.append((start_val, end_val))

    return out

peptide_redundancy(df)

Compute the redundancy of peptides in a DataFrame based on their start and end positions.
Redundancy is defined as the number of peptides overlapping at each position.

Parameters:

Name Type Description Default
df IntoFrame

DataFrame containing peptide information with 'start' and 'end' columns.

required

Returns:

Type Description
tuple[ndarray, ndarray]

A tuple containing:

  • r_number: An array of positions from the minimum start to the maximum end.
  • redundancy: An array of redundancy counts for each position in r_number.
Source code in hdxms_datasets/utils.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
@nw.narwhalify
def peptide_redundancy(df: IntoFrame) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute the redundancy of peptides in a DataFrame based on their start and end positions.
    Redundancy is defined as the number of peptides overlapping at each position.

    Duplicate ('start', 'end') pairs are counted once. Intervals are treated as
    inclusive, so a peptide contributes to every position from its start up to
    and including its end.

    Args:
        df: DataFrame containing peptide information with 'start' and 'end' columns.

    Returns:
        A tuple containing:
        - r_number: An array of positions from the minimum start to the maximum end.
        - redundancy: An array of redundancy counts for each position in r_number.

        Both arrays are empty when the dataframe has no rows.

    """
    df = cast(nw.DataFrame, df).select(["start", "end"]).unique()
    if len(df) == 0:
        return np.array([], dtype=int), np.array([], dtype=int)

    # Take the true extremes: after sorting by start, the last row's end is
    # not necessarily the largest end (e.g. a short peptide nested in a long one).
    vmin, vmax = df["start"].min(), df["end"].max()

    r_number = np.arange(vmin, vmax + 1, dtype=int)
    redundancy = np.zeros_like(r_number, dtype=int)
    for s, e in df.rows():
        i0 = np.searchsorted(r_number, s, side="left")
        # side="right" puts the slice stop just past `e`, so the end residue
        # itself is counted (inclusive interval).
        i1 = np.searchsorted(r_number, e, side="right")
        redundancy[i0:i1] += 1

    return r_number, redundancy

peptides_are_unique(peptides_df)

Check if the peptides in the dataframe are unique.

Source code in hdxms_datasets/utils.py
189
190
191
192
def peptides_are_unique(peptides_df: nw.DataFrame) -> bool:
    """Return True when no ('start', 'end') pair occurs more than once in the dataframe."""
    n_total = len(peptides_df)
    n_distinct = len(peptides_df.select(["start", "end"]).unique())
    # Dropping duplicates leaves the row count unchanged only if every pair was unique.
    return n_distinct == n_total

reconstruct_sequence(peptides, known_sequence, n_term=1)

Reconstruct the sequence from a dataframe of peptides with sequence information.
The sequence is reconstructed by replacing the known sequence with the peptide
sequences at the specified start and end positions.

Parameters:

Name Type Description Default
peptides DataFrame

DataFrame containing peptide information.

required
known_sequence str

Starting sequence. Can be a string 'X' as placeholder.

required
n_term int

The residue number of the N-terminal residue. This is typically 1, can be
negative in case of purification tags.

1

Returns:

Type Description
str

The reconstructed sequence.

Source code in hdxms_datasets/utils.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
@nw.narwhalify
def reconstruct_sequence(
    peptides: nw.DataFrame,
    known_sequence: str,
    n_term: int = 1,
) -> str:
    """
    Reconstruct the sequence form a dataframe of peptides with sequence information.
    The sequence is reconstructed by replacing the known sequence with the peptide
    sequences at the specified start and end positions.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: Starting sequence. Can be a string 'X' as placeholder.
        n_term: The residue number of the N-terminal residue. This is typically 1, can be
            negative in case of purification tags.

    Returns:
        The reconstructed sequence.
    """

    reconstructed = list(known_sequence)
    for start_, end_, sequence_ in peptides.select(["start", "end", "sequence"]).iter_rows():  # type: ignore
        start_idx = start_ - n_term
        assert end_ - start_ + 1 == len(sequence_), (
            f"Length mismatch at {start_}:{end_} with sequence {sequence_}"
        )

        for i, aa in enumerate(sequence_, start=start_idx):
            reconstructed[i] = aa

    return "".join(reconstructed)

records_to_dict(records)

Convert a list of records to a dictionary.

Source code in hdxms_datasets/utils.py
20
21
22
23
24
25
26
27
28
29
def records_to_dict(records: list[dict[str, Any]]) -> dict[str, Any]:
    """
    Pivot a list of records into a dictionary of lists.

    Args:
        records: Dictionaries to combine.

    Returns:
        A dictionary mapping each key found in any record to the list of
        values seen for that key, in record order. Records lacking a key
        contribute nothing to that key's list.
    """
    columns: dict[str, Any] = {}
    for row in records:
        for name, item in row.items():
            # Create the list on first sight of a key, then append to it.
            columns.setdefault(name, []).append(item)

    return columns

slice_exposure(df)

Factory returning FrameSlicer for df using column 'exposure'.

Source code in hdxms_datasets/utils.py
228
229
230
def slice_exposure(df: IntoFrameT) -> FrameSlicer[IntoFrameT]:
    """Build a FrameSlicer over ``df`` keyed on its 'exposure' column."""
    exposure_slicer = FrameSlicer(df, col="exposure")
    return exposure_slicer

verify_sequence(peptides, known_sequence, n_term=1)

Verify the sequence of peptides against the given sequence.

Parameters:

Name Type Description Default
peptides IntoFrame

DataFrame containing peptide information.

required
known_sequence str

The original sequence to check against.

required
n_term int

The residue number of the N-terminal residue (typically 1).

1

Returns:

Type Description
list[tuple[int, str, str]]

A list of mismatches, each a tuple of (residue number, expected residue, found residue).

Source code in hdxms_datasets/utils.py
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
@nw.narwhalify
def verify_sequence(
    peptides: IntoFrame,
    known_sequence: str,
    n_term: int = 1,
) -> list[tuple[int, str, str]]:
    """
    Verify the sequence of peptides against the given sequence.

    Args:
        peptides: DataFrame containing peptide information.
        known_sequence: The original sequence to check against.
        n_term: The number of N-terminal residues to consider.

    Returns:
        A tuple containing the fixed sequence and a list of mismatches.
    """

    reconstructed_sequence = reconstruct_sequence(peptides, known_sequence, n_term)

    mismatches = []
    for r_number, (expected, found) in enumerate(
        zip(known_sequence, reconstructed_sequence), start=n_term
    ):
        if expected != found:
            mismatches.append((r_number, expected, found))

    return mismatches