From 8a9f8bdedfb1dc4c9b797e06546cab5825387d1d Mon Sep 17 00:00:00 2001
From: SexyERIC0723 <haoyuwang144@gmail.com>
Date: Sun, 5 Apr 2026 16:23:38 +0100
Subject: [PATCH 1/2] feat: add ClustalW .aln format support for sequence
 alignments

Closes biotite-dev/biotite#774
---
 src/biotite/sequence/io/clustal/__init__.py |  21 +++
 src/biotite/sequence/io/clustal/convert.py  | 113 ++++++++++++++
 src/biotite/sequence/io/clustal/file.py     | 162 ++++++++++++++++++++
 tests/sequence/data/clustal.aln             |   6 +
 tests/sequence/data/clustal_multi.aln       |   9 ++
 tests/sequence/test_clustal.py              | 121 +++++++++++++++
 6 files changed, 432 insertions(+)
 create mode 100644 src/biotite/sequence/io/clustal/__init__.py
 create mode 100644 src/biotite/sequence/io/clustal/convert.py
 create mode 100644 src/biotite/sequence/io/clustal/file.py
 create mode 100644 tests/sequence/data/clustal.aln
 create mode 100644 tests/sequence/data/clustal_multi.aln
 create mode 100644 tests/sequence/test_clustal.py

diff --git a/src/biotite/sequence/io/clustal/__init__.py b/src/biotite/sequence/io/clustal/__init__.py
new file mode 100644
index 000000000..4fe006797
--- /dev/null
+++ b/src/biotite/sequence/io/clustal/__init__.py
@@ -0,0 +1,21 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+"""
+This subpackage is used for reading and writing sequence alignment objects
+using the ClustalW format (``.aln``).
+
+This package contains the :class:`ClustalFile`, which provides a
+dictionary-like interface to ClustalW files, where sequence names are
+keys and gapped sequence strings are the corresponding values.
+
+Furthermore, the package contains convenience functions for
+getting/setting directly :class:`Alignment` objects, rather than strings.
+"""
+
+__name__ = "biotite.sequence.io.clustal"
+__author__ = "Biotite contributors"
+
+from .convert import *
+from .file import *
diff --git a/src/biotite/sequence/io/clustal/convert.py b/src/biotite/sequence/io/clustal/convert.py
new file mode 100644
index 000000000..5ddb84e3e
--- /dev/null
+++ b/src/biotite/sequence/io/clustal/convert.py
@@ -0,0 +1,113 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+__name__ = "biotite.sequence.io.clustal"
+__author__ = "Biotite contributors"
+
+import functools
+import warnings
+from collections import OrderedDict
+from biotite.sequence.align.alignment import Alignment
+from biotite.sequence.alphabet import AlphabetError, LetterAlphabet
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
+
+__all__ = ["get_alignment", "set_alignment"]
+
+
+def get_alignment(clustal_file, seq_type=None):
+    """
+    Get an alignment from a :class:`ClustalFile` instance.
+
+    Parameters
+    ----------
+    clustal_file : ClustalFile
+        The :class:`ClustalFile` to be accessed.
+    seq_type : type[Sequence], optional
+        The :class:`Sequence` subclass contained in the file.
+        If not set, the type is automatically inferred as
+        :class:`ProteinSequence` or :class:`NucleotideSequence`.
+        For large sequence data it is recommended to set this parameter.
+
+    Returns
+    -------
+    alignment : Alignment
+        The alignment from the :class:`ClustalFile`.
+    """
+    seq_strings = list(clustal_file.values())
+    return Alignment.from_strings(
+        seq_strings,
+        functools.partial(_convert_to_sequence, seq_type=seq_type),
+    )
+
+
+def set_alignment(clustal_file, alignment, seq_names, line_length=60):
+    """
+    Fill a :class:`ClustalFile` with gapped sequence strings from an
+    alignment.
+
+    Parameters
+    ----------
+    clustal_file : ClustalFile
+        The :class:`ClustalFile` to be accessed.
+    alignment : Alignment
+        The alignment to be set.
+    seq_names : iterable object of str
+        The names for the sequences in the alignment.
+        Must have the same length as the sequence count in `alignment`.
+    line_length : int, optional
+        The number of sequence characters per line in each block.
+        Default is 60.
+    """
+    gapped_seq_strings = alignment.get_gapped_sequences()
+    if len(gapped_seq_strings) != len(seq_names):
+        raise ValueError(
+            f"Alignment has {len(gapped_seq_strings)} sequences, "
+            f"but {len(seq_names)} names were given"
+        )
+    # Store sequences in internal dict
+    for name, seq_str in zip(seq_names, gapped_seq_strings):
+        clustal_file._entries[name] = seq_str
+    # Rebuild the text lines with the specified line length
+    clustal_file._rebuild_lines(line_length=line_length)
+
+
+def _convert_to_sequence(seq_str, seq_type=None):
+    if seq_type is not None:
+        if seq_type == NucleotideSequence:
+            return _convert_to_nucleotide(seq_str)
+        elif seq_type == ProteinSequence:
+            if "U" in seq_str:
+                warnings.warn(
+                    "ProteinSequence objects do not support selenocysteine "
+                    "(U), occurrences were substituted by cysteine (C)"
+                )
+            return _convert_to_protein(seq_str)
+        else:
+            return seq_type(seq_str)
+
+    try:
+        return _convert_to_nucleotide(seq_str)
+    except AlphabetError:
+        pass
+    try:
+        prot_seq = _convert_to_protein(seq_str)
+        if "U" in seq_str:
+            warnings.warn(
+                "ProteinSequence objects do not support selenocysteine (U), "
+                "occurrences were substituted by cysteine (C)"
+            )
+        return prot_seq
+    except AlphabetError:
+        raise ValueError(
+            "ClustalW data cannot be converted either to "
+            "'NucleotideSequence' nor to 'ProteinSequence'"
+        )
+
+
+def _convert_to_protein(seq_str):
+    return ProteinSequence(seq_str.upper().replace("U", "C").replace("O", "K"))
+
+
+def _convert_to_nucleotide(seq_str):
+    return NucleotideSequence(seq_str.upper().replace("U", "T").replace("X", "N"))
diff --git a/src/biotite/sequence/io/clustal/file.py b/src/biotite/sequence/io/clustal/file.py
new file mode 100644
index 000000000..e0badf822
--- /dev/null
+++ b/src/biotite/sequence/io/clustal/file.py
@@ -0,0 +1,162 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+__name__ = "biotite.sequence.io.clustal"
+__author__ = "Biotite contributors"
+__all__ = ["ClustalFile"]
+
+from collections import OrderedDict
+from collections.abc import MutableMapping
+from biotite.file import InvalidFileError, TextFile
+
+
+_CONSENSUS_CHARS = frozenset(" *:.")
+
+
+class ClustalFile(TextFile, MutableMapping):
+    """
+    This class represents a file in ClustalW format (``.aln``).
+
+    A ClustalW file starts with a header line beginning with ``CLUSTAL``,
+    followed by blocks of aligned sequences separated by blank lines.
+    Each sequence line contains a sequence name followed by a segment of
+    the gapped sequence.
+    An optional consensus line, containing only ``*``, ``:``, ``.`` and
+    space characters, may follow each block.
+
+    This class is used in a dictionary-like manner, implementing the
+    :class:`MutableMapping` interface:
+    Sequence names are used as keys,
+    and strings containing the full gapped sequences are the
+    corresponding values.
+
+    Examples
+    --------
+
+    >>> import os.path
+    >>> file = ClustalFile()
+    >>> file["seq1"] = "ADTRCGTARDCGTR-DRTCGRAGD"
+    >>> file["seq2"] = "ADTRCGT---CGTRADRTCGRAGD"
+    >>> print(file["seq1"])
+    ADTRCGTARDCGTR-DRTCGRAGD
+    >>> for name, seq in file.items():
+    ...     print(name, seq)
+    seq1 ADTRCGTARDCGTR-DRTCGRAGD
+    seq2 ADTRCGT---CGTRADRTCGRAGD
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._entries = OrderedDict()
+
+    @classmethod
+    def read(cls, file):
+        """
+        Read a ClustalW file.
+
+        Parameters
+        ----------
+        file : file-like object or str
+            The file to be read.
+            Alternatively a file path can be supplied.
+
+        Returns
+        -------
+        file_object : ClustalFile
+            The parsed file.
+        """
+        file = super().read(file)
+        if len(file.lines) == 0:
+            raise InvalidFileError("File is empty")
+        file._parse()
+        return file
+
+    def _parse(self):
+        """
+        Parse the lines into sequence entries.
+        """
+        # Validate header
+        if not self.lines[0].startswith("CLUSTAL"):
+            raise InvalidFileError(
+                f"File does not start with 'CLUSTAL' header, "
+                f"got '{self.lines[0][:20]}...'"
+            )
+
+        self._entries = OrderedDict()
+        for line in self.lines[1:]:
+            stripped = line.strip()
+            # Skip blank lines
+            if len(stripped) == 0:
+                continue
+            # Skip consensus lines (lines that start with whitespace
+            # and contain only consensus characters)
+            if line[0] == " " and all(c in _CONSENSUS_CHARS for c in line):
+                continue
+            # Sequence lines: name followed by whitespace and sequence
+            # The name must start at the beginning of the line
+            # (no leading whitespace)
+            if line[0] != " ":
+                parts = line.split()
+                if len(parts) >= 2:
+                    name = parts[0]
+                    seq_segment = parts[1]
+                    if name in self._entries:
+                        self._entries[name] += seq_segment
+                    else:
+                        self._entries[name] = seq_segment
+
+    def __setitem__(self, name, seq_str):
+        if not isinstance(name, str):
+            raise IndexError(
+                "'ClustalFile' only supports sequence name strings as keys"
+            )
+        if not isinstance(seq_str, str):
+            raise TypeError(
+                "'ClustalFile' only supports sequence strings as values"
+            )
+        self._entries[name] = seq_str
+        self._rebuild_lines()
+
+    def __getitem__(self, name):
+        if not isinstance(name, str):
+            raise IndexError(
+                "'ClustalFile' only supports sequence name strings as keys"
+            )
+        return self._entries[name]
+
+    def __delitem__(self, name):
+        del self._entries[name]
+        self._rebuild_lines()
+
+    def __len__(self):
+        return len(self._entries)
+
+    def __iter__(self):
+        return self._entries.__iter__()
+
+    def __contains__(self, name):
+        return name in self._entries
+
+    def _rebuild_lines(self, line_length=60):
+        """
+        Rebuild the text lines from the stored entries.
+        """
+        self.lines = ["CLUSTAL W multiple sequence alignment", ""]
+
+        if len(self._entries) == 0:
+            return
+
+        names = list(self._entries.keys())
+        sequences = list(self._entries.values())
+        total_len = max(len(s) for s in sequences)
+        # Pad name column to accommodate the longest name + spacing
+        name_width = max(len(n) for n in names)
+
+        for start in range(0, total_len, line_length):
+            for name, seq_str in zip(names, sequences):
+                segment = seq_str[start : start + line_length]
+                if len(segment) > 0:
+                    padding = " " * (name_width - len(name) + 6)
+                    self.lines.append(f"{name}{padding}{segment}")
+            self.lines.append("")
diff --git a/tests/sequence/data/clustal.aln b/tests/sequence/data/clustal.aln
new file mode 100644
index 000000000..9d4296502
--- /dev/null
+++ b/tests/sequence/data/clustal.aln
@@ -0,0 +1,6 @@
+CLUSTAL W (1.83) multiple sequence alignment
+
+seq1            ADTRCGTARDCGTR-DRTCGRAGD
+seq2            ADTRCGT---CGTRADRTCGRAGD
+seq3            ADTRCGTARDCGTRADR--GRAGD
+                *******   ***** **  *****
diff --git a/tests/sequence/data/clustal_multi.aln b/tests/sequence/data/clustal_multi.aln
new file mode 100644
index 000000000..19a542f38
--- /dev/null
+++ b/tests/sequence/data/clustal_multi.aln
@@ -0,0 +1,9 @@
+CLUSTAL W (1.83) multiple sequence alignment
+
+seq1            ADTRCGTARDCGTR-DRTCGRAGD
+seq2            ADTRCGT---CGTRADRTCGRAGD
+                ***        **  *********
+
+seq1            ADTRCGTARDCGTR-DRTCGRAGD
+seq2            ADTRCGT---CGTRADRTCGRAGD
+                ***        **  *********
diff --git a/tests/sequence/test_clustal.py b/tests/sequence/test_clustal.py
new file mode 100644
index 000000000..10428e82e
--- /dev/null
+++ b/tests/sequence/test_clustal.py
@@ -0,0 +1,121 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+import io
+import os
+import os.path
+import pytest
+import biotite.sequence as seq
+import biotite.sequence.io.clustal as clustal
+from tests.util import data_dir
+
+
+def test_access_low_level():
+    """Test reading a ClustalW file and accessing sequences by name."""
+    path = os.path.join(data_dir("sequence"), "clustal.aln")
+    file = clustal.ClustalFile.read(path)
+    assert file["seq1"] == "ADTRCGTARDCGTR-DRTCGRAGD"
+    assert file["seq2"] == "ADTRCGT---CGTRADRTCGRAGD"
+    assert file["seq3"] == "ADTRCGTARDCGTRADR--GRAGD"
+    assert list(file.keys()) == ["seq1", "seq2", "seq3"]
+
+
+def test_access_multi_block():
+    """Test reading a ClustalW file with multiple blocks."""
+    path = os.path.join(data_dir("sequence"), "clustal_multi.aln")
+    file = clustal.ClustalFile.read(path)
+    assert file["seq1"] == (
+        "ADTRCGTARDCGTR-DRTCGRAGD"
+        "ADTRCGTARDCGTR-DRTCGRAGD"
+    )
+    assert file["seq2"] == (
+        "ADTRCGT---CGTRADRTCGRAGD"
+        "ADTRCGT---CGTRADRTCGRAGD"
+    )
+
+
+def test_dict_interface():
+    """Test dictionary-like interface of ClustalFile."""
+    file = clustal.ClustalFile()
+    file["seq1"] = "ADTRCGT"
+    file["seq2"] = "ADTRCGT"
+    assert len(file) == 2
+    assert "seq1" in file
+    assert list(file) == ["seq1", "seq2"]
+
+    del file["seq1"]
+    assert len(file) == 1
+    assert "seq1" not in file
+
+
+def test_alignment_conversion():
+    """Test conversion between ClustalFile and Alignment object."""
+    path = os.path.join(data_dir("sequence"), "clustal.aln")
+    file = clustal.ClustalFile.read(path)
+    alignment = clustal.get_alignment(file)
+    assert str(alignment) == (
+        "ADTRCGTARDCGTR-DRTCGRAGD\n"
+        "ADTRCGT---CGTRADRTCGRAGD\n"
+        "ADTRCGTARDCGTRADR--GRAGD"
+    )  # fmt: skip
+
+
+def test_round_trip():
+    """Test writing an alignment and reading it back."""
+    path = os.path.join(data_dir("sequence"), "clustal.aln")
+    file = clustal.ClustalFile.read(path)
+    alignment = clustal.get_alignment(file)
+
+    # Write alignment to a new ClustalFile
+    file2 = clustal.ClustalFile()
+    clustal.set_alignment(
+        file2, alignment, seq_names=["seq1", "seq2", "seq3"]
+    )
+    alignment2 = clustal.get_alignment(file2)
+    assert str(alignment) == str(alignment2)
+
+
+def test_write_read_round_trip():
+    """Test that writing to a file and reading back yields the same
+    alignment."""
+    path = os.path.join(data_dir("sequence"), "clustal.aln")
+    file = clustal.ClustalFile.read(path)
+    alignment = clustal.get_alignment(file)
+
+    # Write to string buffer
+    buffer = io.StringIO()
+    file2 = clustal.ClustalFile()
+    clustal.set_alignment(
+        file2, alignment, seq_names=["seq1", "seq2", "seq3"]
+    )
+    file2.write(buffer)
+
+    # Read back from string buffer
+    buffer.seek(0)
+    file3 = clustal.ClustalFile.read(buffer)
+    alignment3 = clustal.get_alignment(file3)
+    assert str(alignment) == str(alignment3)
+
+
+def test_name_count_mismatch():
+    """Test that mismatched name count raises ValueError."""
+    path = os.path.join(data_dir("sequence"), "clustal.aln")
+    file = clustal.ClustalFile.read(path)
+    alignment = clustal.get_alignment(file)
+
+    file2 = clustal.ClustalFile()
+    with pytest.raises(ValueError):
+        clustal.set_alignment(
+            file2, alignment, seq_names=["seq1", "seq2"]
+        )
+
+
+@pytest.mark.parametrize("seq_type", (None, seq.ProteinSequence))
+def test_seq_type(seq_type):
+    """Test explicit sequence type parameter."""
+    path = os.path.join(data_dir("sequence"), "clustal.aln")
+    file = clustal.ClustalFile.read(path)
+    alignment = clustal.get_alignment(file, seq_type=seq_type)
+    # Should produce a valid alignment regardless of type
+    assert len(alignment.sequences) == 3

From 83b69f9997e364e967a0c179f15584c6d03ed0ea Mon Sep 17 00:00:00 2001
From: SexyERIC0723 <haoyuwang144@gmail.com>
Date: Sun, 5 Apr 2026 16:30:57 +0100
Subject: [PATCH 2/2] fix: address edge cases in ClustalW format (Codex review
 findings)

- Validate sequence count >= 2 in get_alignment() with clear error
- Validate line_length > 0 in set_alignment()
- Clear existing entries before writing in set_alignment() to prevent
  stale data when reusing a ClustalFile object
---
 src/biotite/sequence/io/clustal/convert.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/biotite/sequence/io/clustal/convert.py b/src/biotite/sequence/io/clustal/convert.py
index 5ddb84e3e..613f9b800 100644
--- a/src/biotite/sequence/io/clustal/convert.py
+++ b/src/biotite/sequence/io/clustal/convert.py
@@ -35,6 +35,11 @@ def get_alignment(clustal_file, seq_type=None):
         The alignment from the :class:`ClustalFile`.
     """
     seq_strings = list(clustal_file.values())
+    if len(seq_strings) < 2:
+        raise ValueError(
+            f"ClustalW file contains {len(seq_strings)} sequence(s), "
+            f"but an alignment requires at least 2"
+        )
     return Alignment.from_strings(
         seq_strings,
         functools.partial(_convert_to_sequence, seq_type=seq_type),
@@ -59,13 +64,18 @@ def set_alignment(clustal_file, alignment, seq_names, line_length=60):
         The number of sequence characters per line in each block.
         Default is 60.
     """
+    if line_length < 1:
+        raise ValueError(
+            f"'line_length' must be at least 1, got {line_length}"
+        )
     gapped_seq_strings = alignment.get_gapped_sequences()
     if len(gapped_seq_strings) != len(seq_names):
         raise ValueError(
             f"Alignment has {len(gapped_seq_strings)} sequences, "
             f"but {len(seq_names)} names were given"
         )
-    # Store sequences in internal dict
+    # Clear any existing entries before writing new data
+    clustal_file._entries.clear()
     for name, seq_str in zip(seq_names, gapped_seq_strings):
         clustal_file._entries[name] = seq_str
     # Rebuild the text lines with the specified line length