Python mailbox.mbox is not good at opening compressed mailboxes:

>>> import mailbox
>>> print(len(mailbox.mbox("/tmp/test.mbox")))
9
>>> print(len(mailbox.mbox("/tmp/test.mbox.gz")))
0
>>> print(len(mailbox.mbox("/tmp/test1.mbox.xz")))
0

For a prototype rewrite of the MIA team's Echelon (the engine behind mia-query), I needed to scan compressed mailboxes, and I had to work around this limitation.

Here is the alternative mailbox.mbox implementation:

import lzma
import gzip
import bz2
import mailbox


class StreamMbox(mailbox.mbox):
    """
    mailbox.mbox does not support opening a stream, which is sad.

    This is a subclass that works around it
    """
    def __init__(self, fd: BinaryIO, factory=None, create: bool = True):
        # Do not call parent __init__, just redo everything here to be able to
        # open a stream. This will need to be re-reviewed for every new version
        # of python's stdlib.

        # Mailbox constructor
        self._path = None
        self._factory = factory

        # _singlefileMailbox constructor
        self._file = fd
        self._toc = None
        self._next_key = 0
        self._pending = False       # No changes require rewriting the file.
        self._pending_sync = False  # No need to sync the file
        self._locked = False
        self._file_length = None    # Used to record mailbox size

        # mbox constructor
        self._message_factory = mailbox.mboxMessage

    def flush(self):
        raise NotImplementedError("StreamMbox is a readonly class")


class UsageExample:
    DECOMPRESS = {
        ".xz": lzma.open,
        ".gz": gzip.open,
        ".bz2": bz2.open,
    }

    @classmethod
    def scan(cls, path: Path) -> Generator[ScannedEmail, None, None]:
        decompress = cls.DECOMPRESS.get(path.suffix)
        if decompress is None:
            with open(path.as_posix(), "rb") as fd:
                yield from cls.scan_fd(path, fd)
        else:
            with decompress(path.as_posix(), "rb") as fd:
                yield from cls.scan_fd(path, fd)

    @classmethod
    def scan_fd(cls, path: Path, fd: BinaryIO) -> Generator[ScannedEmail, None, None]:
        mbox = StreamMbox(fd)
        for msg in mbox:
            ...

pdo debian eng python devel hacks

2019-03-05 17:57:33+01:00