From 8a9f8bdedfb1dc4c9b797e06546cab5825387d1d Mon Sep 17 00:00:00 2001 From: SexyERIC0723 Date: Sun, 5 Apr 2026 16:23:38 +0100 Subject: [PATCH 1/2] feat: add ClustalW .aln format support for sequence alignments Closes biotite-dev/biotite#774 --- src/biotite/sequence/io/clustal/__init__.py | 21 +++ src/biotite/sequence/io/clustal/convert.py | 113 ++++++++++++++ src/biotite/sequence/io/clustal/file.py | 162 ++++++++++++++++++++ tests/sequence/data/clustal.aln | 6 + tests/sequence/data/clustal_multi.aln | 9 ++ tests/sequence/test_clustal.py | 121 +++++++++++++++ 6 files changed, 432 insertions(+) create mode 100644 src/biotite/sequence/io/clustal/__init__.py create mode 100644 src/biotite/sequence/io/clustal/convert.py create mode 100644 src/biotite/sequence/io/clustal/file.py create mode 100644 tests/sequence/data/clustal.aln create mode 100644 tests/sequence/data/clustal_multi.aln create mode 100644 tests/sequence/test_clustal.py diff --git a/src/biotite/sequence/io/clustal/__init__.py b/src/biotite/sequence/io/clustal/__init__.py new file mode 100644 index 000000000..4fe006797 --- /dev/null +++ b/src/biotite/sequence/io/clustal/__init__.py @@ -0,0 +1,21 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +""" +This subpackage is used for reading and writing sequence alignment objects +using the ClustalW format (``.aln``). + +This package contains the :class:`ClustalFile`, which provides a +dictionary-like interface to ClustalW files, where sequence names are +keys and gapped sequence strings are the corresponding values. + +Furthermore, the package contains convenience functions for +getting/setting directly :class:`Alignment` objects, rather than strings. +""" + +__name__ = "biotite.sequence.io.clustal" +__author__ = "Biotite contributors" + +from .convert import * +from .file import * diff --git a/src/biotite/sequence/io/clustal/convert.py b/src/biotite/sequence/io/clustal/convert.py new file mode 100644 index 000000000..5ddb84e3e --- /dev/null +++ b/src/biotite/sequence/io/clustal/convert.py @@ -0,0 +1,113 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +__name__ = "biotite.sequence.io.clustal" +__author__ = "Biotite contributors" + +import functools +import warnings +from collections import OrderedDict +from biotite.sequence.align.alignment import Alignment +from biotite.sequence.alphabet import AlphabetError, LetterAlphabet +from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence + +__all__ = ["get_alignment", "set_alignment"] + + +def get_alignment(clustal_file, seq_type=None): + """ + Get an alignment from a :class:`ClustalFile` instance. + + Parameters + ---------- + clustal_file : ClustalFile + The :class:`ClustalFile` to be accessed. + seq_type : type[Sequence], optional + The :class:`Sequence` subclass contained in the file. + If not set, the type is automatically inferred as + :class:`ProteinSequence` or :class:`NucleotideSequence`. + For large sequence data it is recommended to set this parameter. + + Returns + ------- + alignment : Alignment + The alignment from the :class:`ClustalFile`. + """ + seq_strings = list(clustal_file.values()) + return Alignment.from_strings( + seq_strings, + functools.partial(_convert_to_sequence, seq_type=seq_type), + ) + + +def set_alignment(clustal_file, alignment, seq_names, line_length=60): + """ + Fill a :class:`ClustalFile` with gapped sequence strings from an + alignment. + + Parameters + ---------- + clustal_file : ClustalFile + The :class:`ClustalFile` to be accessed. + alignment : Alignment + The alignment to be set. + seq_names : iterable object of str + The names for the sequences in the alignment. + Must have the same length as the sequence count in `alignment`. + line_length : int, optional + The number of sequence characters per line in each block. + Default is 60. + """ + gapped_seq_strings = alignment.get_gapped_sequences() + if len(gapped_seq_strings) != len(seq_names): + raise ValueError( + f"Alignment has {len(gapped_seq_strings)} sequences, " + f"but {len(seq_names)} names were given" + ) + # Store sequences in internal dict + for name, seq_str in zip(seq_names, gapped_seq_strings): + clustal_file._entries[name] = seq_str + # Rebuild the text lines with the specified line length + clustal_file._rebuild_lines(line_length=line_length) + + +def _convert_to_sequence(seq_str, seq_type=None): + if seq_type is not None: + if seq_type == NucleotideSequence: + return _convert_to_nucleotide(seq_str) + elif seq_type == ProteinSequence: + if "U" in seq_str: + warnings.warn( + "ProteinSequence objects do not support selenocysteine " + "(U), occurrences were substituted by cysteine (C)" + ) + return _convert_to_protein(seq_str) + else: + return seq_type(seq_str) + + try: + return _convert_to_nucleotide(seq_str) + except AlphabetError: + pass + try: + prot_seq = _convert_to_protein(seq_str) + if "U" in seq_str: + warnings.warn( + "ProteinSequence objects do not support selenocysteine (U), " + "occurrences were substituted by cysteine (C)" + ) + return prot_seq + except AlphabetError: + raise ValueError( + "ClustalW data cannot be converted either to " + "'NucleotideSequence' nor to 'ProteinSequence'" + ) + + +def _convert_to_protein(seq_str): + return ProteinSequence(seq_str.upper().replace("U", "C").replace("O", "K")) + + +def _convert_to_nucleotide(seq_str): + return NucleotideSequence(seq_str.upper().replace("U", "T").replace("X", "N")) diff --git a/src/biotite/sequence/io/clustal/file.py b/src/biotite/sequence/io/clustal/file.py new file mode 100644 index 000000000..e0badf822 --- /dev/null +++ b/src/biotite/sequence/io/clustal/file.py @@ -0,0 +1,162 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +__name__ = "biotite.sequence.io.clustal" +__author__ = "Biotite contributors" +__all__ = ["ClustalFile"] + +from collections import OrderedDict +from collections.abc import MutableMapping +from biotite.file import InvalidFileError, TextFile + + +_CONSENSUS_CHARS = frozenset(" *:.") + + +class ClustalFile(TextFile, MutableMapping): + """ + This class represents a file in ClustalW format (``.aln``). + + A ClustalW file starts with a header line beginning with ``CLUSTAL``, + followed by blocks of aligned sequences separated by blank lines. + Each sequence line contains a sequence name followed by a segment of + the gapped sequence. + An optional consensus line, containing only ``*``, ``:``, ``.`` and + space characters, may follow each block. + + This class is used in a dictionary-like manner, implementing the + :class:`MutableMapping` interface: + Sequence names are used as keys, + and strings containing the full gapped sequences are the + corresponding values. + + Examples + -------- + + >>> import os.path + >>> file = ClustalFile() + >>> file["seq1"] = "ADTRCGTARDCGTR-DRTCGRAGD" + >>> file["seq2"] = "ADTRCGT---CGTRADRTCGRAGD" + >>> print(file["seq1"]) + ADTRCGTARDCGTR-DRTCGRAGD + >>> for name, seq in file.items(): + ... print(name, seq) + seq1 ADTRCGTARDCGTR-DRTCGRAGD + seq2 ADTRCGT---CGTRADRTCGRAGD + """ + + def __init__(self): + super().__init__() + self._entries = OrderedDict() + + @classmethod + def read(cls, file): + """ + Read a ClustalW file. + + Parameters + ---------- + file : file-like object or str + The file to be read. + Alternatively a file path can be supplied. + + Returns + ------- + file_object : ClustalFile + The parsed file. + """ + file = super().read(file) + if len(file.lines) == 0: + raise InvalidFileError("File is empty") + file._parse() + return file + + def _parse(self): + """ + Parse the lines into sequence entries. + """ + # Validate header + if not self.lines[0].startswith("CLUSTAL"): + raise InvalidFileError( + f"File does not start with 'CLUSTAL' header, " + f"got '{self.lines[0][:20]}...'" + ) + + self._entries = OrderedDict() + for line in self.lines[1:]: + stripped = line.strip() + # Skip blank lines + if len(stripped) == 0: + continue + # Skip consensus lines (lines that start with whitespace + # and contain only consensus characters) + if line[0] == " " and all(c in _CONSENSUS_CHARS for c in line): + continue + # Sequence lines: name followed by whitespace and sequence + # The name must start at the beginning of the line + # (no leading whitespace) + if line[0] != " ": + parts = line.split() + if len(parts) >= 2: + name = parts[0] + seq_segment = parts[1] + if name in self._entries: + self._entries[name] += seq_segment + else: + self._entries[name] = seq_segment + + def __setitem__(self, name, seq_str): + if not isinstance(name, str): + raise IndexError( + "'ClustalFile' only supports sequence name strings as keys" + ) + if not isinstance(seq_str, str): + raise TypeError( + "'ClustalFile' only supports sequence strings as values" + ) + self._entries[name] = seq_str + self._rebuild_lines() + + def __getitem__(self, name): + if not isinstance(name, str): + raise IndexError( + "'ClustalFile' only supports sequence name strings as keys" + ) + return self._entries[name] + + def __delitem__(self, name): + del self._entries[name] + self._rebuild_lines() + + def __len__(self): + return len(self._entries) + + def __iter__(self): + return self._entries.__iter__() + + def __contains__(self, name): + return name in self._entries + + def _rebuild_lines(self, line_length=60): + """ + Rebuild the text lines from the stored entries. + """ + self.lines = ["CLUSTAL W multiple sequence alignment", ""] + + if len(self._entries) == 0: + return + + names = list(self._entries.keys()) + sequences = list(self._entries.values()) + total_len = max(len(s) for s in sequences) + # Pad name column to accommodate the longest name + spacing + name_width = max(len(n) for n in names) + + for start in range(0, total_len, line_length): + for name, seq_str in zip(names, sequences): + segment = seq_str[start : start + line_length] + if len(segment) > 0: + padding = " " * (name_width - len(name) + 6) + self.lines.append(f"{name}{padding}{segment}") + self.lines.append("") diff --git a/tests/sequence/data/clustal.aln b/tests/sequence/data/clustal.aln new file mode 100644 index 000000000..9d4296502 --- /dev/null +++ b/tests/sequence/data/clustal.aln @@ -0,0 +1,6 @@ +CLUSTAL W (1.83) multiple sequence alignment + +seq1 ADTRCGTARDCGTR-DRTCGRAGD +seq2 ADTRCGT---CGTRADRTCGRAGD +seq3 ADTRCGTARDCGTRADR--GRAGD + ******* ***** ** ***** diff --git a/tests/sequence/data/clustal_multi.aln b/tests/sequence/data/clustal_multi.aln new file mode 100644 index 000000000..19a542f38 --- /dev/null +++ b/tests/sequence/data/clustal_multi.aln @@ -0,0 +1,9 @@ +CLUSTAL W (1.83) multiple sequence alignment + +seq1 ADTRCGTARDCGTR-DRTCGRAGD +seq2 ADTRCGT---CGTRADRTCGRAGD + *** ** ********* + +seq1 ADTRCGTARDCGTR-DRTCGRAGD +seq2 ADTRCGT---CGTRADRTCGRAGD + *** ** ********* diff --git a/tests/sequence/test_clustal.py b/tests/sequence/test_clustal.py new file mode 100644 index 000000000..10428e82e --- /dev/null +++ b/tests/sequence/test_clustal.py @@ -0,0 +1,121 @@ +# This source code is part of the Biotite package and is distributed +# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further +# information. + +import io +import os +import os.path +import pytest +import biotite.sequence as seq +import biotite.sequence.io.clustal as clustal +from tests.util import data_dir + + +def test_access_low_level(): + """Test reading a ClustalW file and accessing sequences by name.""" + path = os.path.join(data_dir("sequence"), "clustal.aln") + file = clustal.ClustalFile.read(path) + assert file["seq1"] == "ADTRCGTARDCGTR-DRTCGRAGD" + assert file["seq2"] == "ADTRCGT---CGTRADRTCGRAGD" + assert file["seq3"] == "ADTRCGTARDCGTRADR--GRAGD" + assert list(file.keys()) == ["seq1", "seq2", "seq3"] + + +def test_access_multi_block(): + """Test reading a ClustalW file with multiple blocks.""" + path = os.path.join(data_dir("sequence"), "clustal_multi.aln") + file = clustal.ClustalFile.read(path) + assert file["seq1"] == ( + "ADTRCGTARDCGTR-DRTCGRAGD" + "ADTRCGTARDCGTR-DRTCGRAGD" + ) + assert file["seq2"] == ( + "ADTRCGT---CGTRADRTCGRAGD" + "ADTRCGT---CGTRADRTCGRAGD" + ) + + +def test_dict_interface(): + """Test dictionary-like interface of ClustalFile.""" + file = clustal.ClustalFile() + file["seq1"] = "ADTRCGT" + file["seq2"] = "ADTRCGT" + assert len(file) == 2 + assert "seq1" in file + assert list(file) == ["seq1", "seq2"] + + del file["seq1"] + assert len(file) == 1 + assert "seq1" not in file + + +def test_alignment_conversion(): + """Test conversion between ClustalFile and Alignment object.""" + path = os.path.join(data_dir("sequence"), "clustal.aln") + file = clustal.ClustalFile.read(path) + alignment = clustal.get_alignment(file) + assert str(alignment) == ( + "ADTRCGTARDCGTR-DRTCGRAGD\n" + "ADTRCGT---CGTRADRTCGRAGD\n" + "ADTRCGTARDCGTRADR--GRAGD" + ) # fmt: skip + + +def test_round_trip(): + """Test writing an alignment and reading it back.""" + path = os.path.join(data_dir("sequence"), "clustal.aln") + file = clustal.ClustalFile.read(path) + alignment = clustal.get_alignment(file) + + # Write alignment to a new ClustalFile + file2 = clustal.ClustalFile() + clustal.set_alignment( + file2, alignment, seq_names=["seq1", "seq2", "seq3"] + ) + alignment2 = clustal.get_alignment(file2) + assert str(alignment) == str(alignment2) + + +def test_write_read_round_trip(): + """Test that writing to a file and reading back yields the same + alignment.""" + path = os.path.join(data_dir("sequence"), "clustal.aln") + file = clustal.ClustalFile.read(path) + alignment = clustal.get_alignment(file) + + # Write to string buffer + buffer = io.StringIO() + file2 = clustal.ClustalFile() + clustal.set_alignment( + file2, alignment, seq_names=["seq1", "seq2", "seq3"] + ) + file2.write(buffer) + + # Read back from string buffer + buffer.seek(0) + file3 = clustal.ClustalFile.read(buffer) + alignment3 = clustal.get_alignment(file3) + assert str(alignment) == str(alignment3) + + +def test_name_count_mismatch(): + """Test that mismatched name count raises ValueError.""" + path = os.path.join(data_dir("sequence"), "clustal.aln") + file = clustal.ClustalFile.read(path) + alignment = clustal.get_alignment(file) + + file2 = clustal.ClustalFile() + with pytest.raises(ValueError): + clustal.set_alignment( + file2, alignment, seq_names=["seq1", "seq2"] + ) + + +@pytest.mark.parametrize("seq_type", (None, seq.ProteinSequence)) +def test_seq_type(seq_type): + """Test explicit sequence type parameter.""" + path = os.path.join(data_dir("sequence"), "clustal.aln") + file = clustal.ClustalFile.read(path) + alignment = clustal.get_alignment(file, seq_type=seq_type) + # Should produce a valid alignment regardless of type + assert len(alignment.sequences) == 3 From 83b69f9997e364e967a0c179f15584c6d03ed0ea Mon Sep 17 00:00:00 2001 From: SexyERIC0723 Date: Sun, 5 Apr 2026 16:30:57 +0100 Subject: [PATCH 2/2] fix: address edge cases in ClustalW format (Codex review findings) - Validate sequence count >= 2 in get_alignment() with clear error - Validate line_length > 0 in set_alignment() - Clear existing entries before writing in set_alignment() to prevent stale data when reusing a ClustalFile object --- src/biotite/sequence/io/clustal/convert.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/biotite/sequence/io/clustal/convert.py b/src/biotite/sequence/io/clustal/convert.py index 5ddb84e3e..613f9b800 100644 --- a/src/biotite/sequence/io/clustal/convert.py +++ b/src/biotite/sequence/io/clustal/convert.py @@ -35,6 +35,11 @@ def get_alignment(clustal_file, seq_type=None): The alignment from the :class:`ClustalFile`. """ seq_strings = list(clustal_file.values()) + if len(seq_strings) < 2: + raise ValueError( + f"ClustalW file contains {len(seq_strings)} sequence(s), " + f"but an alignment requires at least 2" + ) return Alignment.from_strings( seq_strings, functools.partial(_convert_to_sequence, seq_type=seq_type), @@ -59,13 +64,18 @@ def set_alignment(clustal_file, alignment, seq_names, line_length=60): The number of sequence characters per line in each block. Default is 60. """ + if line_length < 1: + raise ValueError( + f"'line_length' must be at least 1, got {line_length}" + ) gapped_seq_strings = alignment.get_gapped_sequences() if len(gapped_seq_strings) != len(seq_names): raise ValueError( f"Alignment has {len(gapped_seq_strings)} sequences, " f"but {len(seq_names)} names were given" ) - # Store sequences in internal dict + # Clear any existing entries before writing new data + clustal_file._entries.clear() for name, seq_str in zip(seq_names, gapped_seq_strings): clustal_file._entries[name] = seq_str # Rebuild the text lines with the specified line length