ruk·si

🐍 Python
Files as Recursive Objects

Updated at 2019-06-07 11:59

If you are dealing with a lot of files, it might make sense to build more abstractions on top of the raw data. But keep in mind that every level of abstraction costs performance and memory.

from typing import Optional, Union, List

class Text:
    """A piece of text that can be broken down into smaller ``Text`` objects.

    The raw string is split once, at construction time, into paragraphs
    (separated by a blank line), lines and whitespace-separated words.
    Every accessor wraps the selected piece in a new ``Text``, so lookups
    can be chained, e.g. ``text.paragraph(1).line(0).word(-1)``.

    ``str(text)`` and ``repr(text)`` both return the raw string.
    """

    _data: str
    _paragraphs: List[str]
    _lines: List[str]
    _words: List[str]

    def __init__(
        self,
        filename: Optional[str] = None,
        data: Optional[Union[str, 'Text']] = None,
    ) -> None:
        """Build a text from in-memory data or from the contents of a file.

        Args:
            filename: Path of a file to read; only used when ``data`` is
                ``None``.
            data: The text itself, as a string or another ``Text``. Takes
                precedence over ``filename``.

        Raises:
            ValueError: If neither ``filename`` nor ``data`` is given.
        """
        if data is not None:
            self._data = str(data)
        elif filename is not None:
            # Context manager so the handle is closed as soon as the
            # content has been read.
            with open(filename) as handle:
                self._data = handle.read()
        else:
            raise ValueError('filename or data is required')

        # Leading/trailing newlines would otherwise produce empty
        # paragraphs and lines at both ends.
        stripped = self._data.strip('\n')
        self._paragraphs = stripped.split('\n\n')
        self._lines = stripped.split('\n')
        self._words = stripped.split()

    def __str__(self) -> str:
        """Return the raw, unsplit text."""
        return self._data

    def __repr__(self) -> str:
        """Return the raw, unsplit text (same as ``str``)."""
        return self._data

    @property
    def paragraphs(self) -> List['Text']:
        """All paragraphs; new ``Text`` objects are built on every access."""
        return [Text(data=p) for p in self._paragraphs]

    def paragraph(self, index: int) -> 'Text':
        """Return the paragraph at ``index`` (list indexing rules apply)."""
        return Text(data=self._paragraphs[index])

    @property
    def lines(self) -> List['Text']:
        """All lines, empty ones included; rebuilt on every access."""
        return [Text(data=l) for l in self._lines]

    def line(self, index: int) -> 'Text':
        """Return the line at ``index`` (list indexing rules apply)."""
        return Text(data=self._lines[index])

    @property
    def words(self) -> List['Text']:
        """All whitespace-separated words; rebuilt on every access."""
        return [Text(data=w) for w in self._words]

    def word(self, index: int) -> 'Text':
        """Return the word at ``index`` (list indexing rules apply)."""
        return Text(data=self._words[index])

import tempfile

# Write a small sample document to a temporary file and load it by name.
# The context manager closes the temporary file even if loading fails;
# Text reads the whole file in its constructor, so the file is no longer
# needed once `text` exists.
with tempfile.NamedTemporaryFile() as tf:
    tf.write(b'''
Evolution of Species

This is the first paragraph.
There can be multiple lines per paragraph.

This is the last line.
''')
    # flush explicitly so the bytes are on disk before the file is
    # re-opened through its name
    tf.flush()
    text = Text(tf.name)

# paragraphs are separated by a blank line
assert len(text.paragraphs) == 3
assert str(text.paragraph(0)) == 'Evolution of Species'
assert str(text.paragraph(1)).count('\n') == 1
assert str(text.paragraph(2)).count('\n') == 0

# note that an empty line is still a line
assert len(text.lines) == 6
assert str(text.line(0)) == 'Evolution of Species'
assert str(text.line(1)) == ''
assert str(text.line(2)) == 'This is the first paragraph.'
assert str(text.line(-1)) == 'This is the last line.'

# words are whitespace-separated; punctuation stays attached
assert len(text.words) == 20
assert str(text.word(0)) == 'Evolution'
assert str(text.word(-1)) == 'line.'

# this allows complex lookups, like the following:

# "second paragraph, second line, last word"
assert str(text.paragraph(1).line(1).word(-1)) == 'paragraph.'

# "second word of each paragraph"
assert [str(p.word(1)) for p in text.paragraphs] == ['of', 'is', 'is']