biotite-dev · haoyu-haoyu · Apr 5, 2026 · Apr 5, 2026
diff --git a/src/biotite/sequence/io/clustal/__init__.py b/src/biotite/sequence/io/clustal/__init__.py
@@ -0,0 +1,21 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+"""
+This subpackage is used for reading and writing sequence alignment objects
+using the ClustalW format (``.aln``).
+
+This package contains the :class:`ClustalFile`, which provides a
+dictionary-like interface to ClustalW files, where sequence names are
+keys and gapped sequence strings are the corresponding values.
+
+Furthermore, the package contains convenience functions for
+getting/setting directly :class:`Alignment` objects, rather than strings.
+"""
+
+__name__ = "biotite.sequence.io.clustal"
+__author__ = "Biotite contributors"
+
+from .convert import *
+from .file import *
diff --git a/src/biotite/sequence/io/clustal/convert.py b/src/biotite/sequence/io/clustal/convert.py
@@ -0,0 +1,123 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+__name__ = "biotite.sequence.io.clustal"
+__author__ = "Biotite contributors"
+
+import functools
+import warnings
+from collections import OrderedDict
+from biotite.sequence.align.alignment import Alignment
+from biotite.sequence.alphabet import AlphabetError, LetterAlphabet
+from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence
+
+__all__ = ["get_alignment", "set_alignment"]
+
+
+def get_alignment(clustal_file, seq_type=None):
+    """
+    Get an alignment from a :class:`ClustalFile` instance.
+
+    Parameters
+    ----------
+    clustal_file : ClustalFile
+        The :class:`ClustalFile` to be accessed.
+    seq_type : type[Sequence], optional
+        The :class:`Sequence` subclass contained in the file.
+        If not set, the type is automatically inferred as
+        :class:`ProteinSequence` or :class:`NucleotideSequence`.
+        For large sequence data it is recommended to set this parameter.
+
+    Returns
+    -------
+    alignment : Alignment
+        The alignment from the :class:`ClustalFile`.
+    """
+    seq_strings = list(clustal_file.values())
+    if len(seq_strings) < 2:
+        raise ValueError(
+            f"ClustalW file contains {len(seq_strings)} sequence(s), "
+            f"but an alignment requires at least 2"
+        )
+    return Alignment.from_strings(
+        seq_strings,
+        functools.partial(_convert_to_sequence, seq_type=seq_type),
+    )
+
+
+def set_alignment(clustal_file, alignment, seq_names, line_length=60):
+    """
+    Fill a :class:`ClustalFile` with gapped sequence strings from an
+    alignment.
+
+    Parameters
+    ----------
+    clustal_file : ClustalFile
+        The :class:`ClustalFile` to be accessed.
+    alignment : Alignment
+        The alignment to be set.
+    seq_names : iterable object of str
+        The names for the sequences in the alignment.
+        Must have the same length as the sequence count in `alignment`.
+    line_length : int, optional
+        The number of sequence characters per line in each block.
+        Default is 60.
+    """
+    if line_length < 1:
+        raise ValueError(
+            f"'line_length' must be at least 1, got {line_length}"
+        )
+    gapped_seq_strings = alignment.get_gapped_sequences()
+    if len(gapped_seq_strings) != len(seq_names):
+        raise ValueError(
+            f"Alignment has {len(gapped_seq_strings)} sequences, "
+            f"but {len(seq_names)} names were given"
+        )
+    # Clear any existing entries before writing new data
+    clustal_file._entries.clear()
+    for name, seq_str in zip(seq_names, gapped_seq_strings):
+        clustal_file._entries[name] = seq_str
+    # Rebuild the text lines with the specified line length
+    clustal_file._rebuild_lines(line_length=line_length)
+
+
+def _convert_to_sequence(seq_str, seq_type=None):
+    if seq_type is not None:
+        if seq_type == NucleotideSequence:
+            return _convert_to_nucleotide(seq_str)
+        elif seq_type == ProteinSequence:
+            if "U" in seq_str:
+                warnings.warn(
+                    "ProteinSequence objects do not support selenocysteine "
+                    "(U), occurrences were substituted by cysteine (C)"
+                )
+            return _convert_to_protein(seq_str)
+        else:
+            return seq_type(seq_str)
+
+    try:
+        return _convert_to_nucleotide(seq_str)
+    except AlphabetError:
+        pass
+    try:
+        prot_seq = _convert_to_protein(seq_str)
+        if "U" in seq_str:
+            warnings.warn(
+                "ProteinSequence objects do not support selenocysteine (U), "
+                "occurrences were substituted by cysteine (C)"
+            )
+        return prot_seq
+    except AlphabetError:
+        raise ValueError(
+            "ClustalW data cannot be converted either to "
+            "'NucleotideSequence' nor to 'ProteinSequence'"
+        )
+
+
+def _convert_to_protein(seq_str):
+    return ProteinSequence(seq_str.upper().replace("U", "C").replace("O", "K"))
+
+
+def _convert_to_nucleotide(seq_str):
+    return NucleotideSequence(seq_str.upper().replace("U", "T").replace("X", "N"))
diff --git a/src/biotite/sequence/io/clustal/file.py b/src/biotite/sequence/io/clustal/file.py
@@ -0,0 +1,162 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+__name__ = "biotite.sequence.io.clustal"
+__author__ = "Biotite contributors"
+__all__ = ["ClustalFile"]
+
+from collections import OrderedDict
+from collections.abc import MutableMapping
+from biotite.file import InvalidFileError, TextFile
+
+
+_CONSENSUS_CHARS = frozenset(" *:.")
+
+
+class ClustalFile(TextFile, MutableMapping):
+    """
+    This class represents a file in ClustalW format (``.aln``).
+
+    A ClustalW file starts with a header line beginning with ``CLUSTAL``,
+    followed by blocks of aligned sequences separated by blank lines.
+    Each sequence line contains a sequence name followed by a segment of
+    the gapped sequence.
+    An optional consensus line, containing only ``*``, ``:``, ``.`` and
+    space characters, may follow each block.
+
+    This class is used in a dictionary-like manner, implementing the
+    :class:`MutableMapping` interface:
+    Sequence names are used as keys,
+    and strings containing the full gapped sequences are the
+    corresponding values.
+
+    Examples
+    --------
+
+    >>> import os.path
+    >>> file = ClustalFile()
+    >>> file["seq1"] = "ADTRCGTARDCGTR-DRTCGRAGD"
+    >>> file["seq2"] = "ADTRCGT---CGTRADRTCGRAGD"
+    >>> print(file["seq1"])
+    ADTRCGTARDCGTR-DRTCGRAGD
+    >>> for name, seq in file.items():
+    ...     print(name, seq)
+    seq1 ADTRCGTARDCGTR-DRTCGRAGD
+    seq2 ADTRCGT---CGTRADRTCGRAGD
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._entries = OrderedDict()
+
+    @classmethod
+    def read(cls, file):
+        """
+        Read a ClustalW file.
+
+        Parameters
+        ----------
+        file : file-like object or str
+            The file to be read.
+            Alternatively a file path can be supplied.
+
+        Returns
+        -------
+        file_object : ClustalFile
+            The parsed file.
+        """
+        file = super().read(file)
+        if len(file.lines) == 0:
+            raise InvalidFileError("File is empty")
+        file._parse()
+        return file
+
+    def _parse(self):
+        """
+        Parse the lines into sequence entries.
+        """
+        # Validate header
+        if not self.lines[0].startswith("CLUSTAL"):
+            raise InvalidFileError(
+                f"File does not start with 'CLUSTAL' header, "
+                f"got '{self.lines[0][:20]}...'"
+            )
+
+        self._entries = OrderedDict()
+        for line in self.lines[1:]:
+            stripped = line.strip()
+            # Skip blank lines
+            if len(stripped) == 0:
+                continue
+            # Skip consensus lines (lines that start with whitespace
+            # and contain only consensus characters)
+            if line[0] == " " and all(c in _CONSENSUS_CHARS for c in line):
+                continue
+            # Sequence lines: name followed by whitespace and sequence
+            # The name must start at the beginning of the line
+            # (no leading whitespace)
+            if line[0] != " ":
+                parts = line.split()
+                if len(parts) >= 2:
+                    name = parts[0]
+                    seq_segment = parts[1]
+                    if name in self._entries:
+                        self._entries[name] += seq_segment
+                    else:
+                        self._entries[name] = seq_segment
+
+    def __setitem__(self, name, seq_str):
+        if not isinstance(name, str):
+            raise IndexError(
+                "'ClustalFile' only supports sequence name strings as keys"
+            )
+        if not isinstance(seq_str, str):
+            raise TypeError(
+                "'ClustalFile' only supports sequence strings as values"
+            )
+        self._entries[name] = seq_str
+        self._rebuild_lines()
+
+    def __getitem__(self, name):
+        if not isinstance(name, str):
+            raise IndexError(
+                "'ClustalFile' only supports sequence name strings as keys"
+            )
+        return self._entries[name]
+
+    def __delitem__(self, name):
+        del self._entries[name]
+        self._rebuild_lines()
+
+    def __len__(self):
+        return len(self._entries)
+
+    def __iter__(self):
+        return self._entries.__iter__()
+
+    def __contains__(self, name):
+        return name in self._entries
+
+    def _rebuild_lines(self, line_length=60):
+        """
+        Rebuild the text lines from the stored entries.
+        """
+        self.lines = ["CLUSTAL W multiple sequence alignment", ""]
+
+        if len(self._entries) == 0:
+            return
+
+        names = list(self._entries.keys())
+        sequences = list(self._entries.values())
+        total_len = max(len(s) for s in sequences)
+        # Pad name column to accommodate the longest name + spacing
+        name_width = max(len(n) for n in names)
+
+        for start in range(0, total_len, line_length):
+            for name, seq_str in zip(names, sequences):
+                segment = seq_str[start : start + line_length]
+                if len(segment) > 0:
+                    padding = " " * (name_width - len(name) + 6)
+                    self.lines.append(f"{name}{padding}{segment}")
+            self.lines.append("")
diff --git a/tests/sequence/data/clustal.aln b/tests/sequence/data/clustal.aln
@@ -0,0 +1,6 @@
+CLUSTAL W (1.83) multiple sequence alignment
+
+seq1            ADTRCGTARDCGTR-DRTCGRAGD
+seq2            ADTRCGT---CGTRADRTCGRAGD
+seq3            ADTRCGTARDCGTRADR--GRAGD
+                *******   ***** **  *****
diff --git a/tests/sequence/data/clustal_multi.aln b/tests/sequence/data/clustal_multi.aln
@@ -0,0 +1,9 @@
+CLUSTAL W (1.83) multiple sequence alignment
+
+seq1            ADTRCGTARDCGTR-DRTCGRAGD
+seq2            ADTRCGT---CGTRADRTCGRAGD
+                ***        **  *********
+
+seq1            ADTRCGTARDCGTR-DRTCGRAGD
+seq2            ADTRCGT---CGTRADRTCGRAGD
+                ***        **  *********