# Markov/src/rulesparser.py

from rule import Rule, EMPTY_SYMBOL
import re

class RulesParser:
    """
    Parser for rule files.

    Every non-empty line is expected to hold one rule written as
    ``<operand> <arrow> <target>``, the parts being separated by
    whitespace. Anything from ``//`` to the end of a line is a comment.
    The syntax is described by the class variables below.
    """
    # arrow of an ordinary rule (parsed with is_blocking=False)
    TRANSFORM = r'->'
    # arrow of a blocking rule (parsed with is_blocking=True)
    B_TRANSFORM = r'->\|'
    NEWLINE = '\n'
    # separator between the parts of a rule
    IGNORE = r'\s+'
    # `.*` rather than `.+`, so that a bare "//" with nothing after it
    # is removed as a comment too
    COMMENTS = r'//.*$'
    # regex-escaped EMPTY_SYMBOL; not referenced by the methods of this class
    EMPTY = re.escape(EMPTY_SYMBOL)

    def parse(self, file: str) -> list[Rule]:
        """
        Parsing file according to syntax, specified
        in class variables. Hardcoded for now (and forever)

        :param file: path to the text file with rules
        :return: rules in the order they appear in the file
        :raises ValueError: if some line is not a well-formed rule
        """
        with open(file, 'r') as f:
            text = f.read()
        return [self._parse_rule(line) for line in self._get_lines(text)]

    def _parse_rule(self, line: str) -> Rule:
        """
        tries to parse rule according to set grammar

        :param line: single line with a rule, comments already removed
        :return: Rule built from the operand, the target and the arrow kind
        :raises ValueError: if the line does not consist of exactly three
            parts or the arrow is not one of the known transforms
        """
        # Trim first: splitting an indented line (or one that ends with
        # whitespace) would otherwise produce empty tokens and shift
        # the positions of operand / arrow / target.
        tokens = re.split(self.IGNORE, line.strip())
        # we always expect 3 parts: operand, arrow, target
        if len(tokens) != 3:
            raise ValueError(f"Can't parse rule {line!r}: expected "
                             f"\"<operand> <arrow> <target>\", "
                             f"but got {len(tokens)} part(s)")
        operand, arrow, target = tokens
        if re.fullmatch(self.TRANSFORM, arrow):
            is_blocking = False
        elif re.fullmatch(self.B_TRANSFORM, arrow):
            is_blocking = True
        else:
            raise ValueError(f"Can't recognize transform symbol. "
                             f"\"{self.TRANSFORM}\" or \"{self.B_TRANSFORM}\""
                             f" expected, but \"{arrow}\" encountered")
        return Rule(
                operand=operand,
                target=target,
                is_blocking=is_blocking
                )

    def _get_lines(self, src: str) -> list[str]:
        """
        Get cleaned lines only with rules to parse

        :param src: whole text of the rules file
        :return: lines without comments and trailing whitespace;
            empty and whitespace-only lines are dropped
        """
        text = self._remove_comments(src)
        text = self._strip_lines(text)
        # `line.strip()` also filters out lines that consist of
        # whitespace only, not just the completely empty ones
        return [line for line in text.split(self.NEWLINE) if line.strip()]

    def _remove_comments(self, src: str) -> str:
        """
        removes comments from end of lines and returns
        cleaned text
        """
        # re.M makes `$` match at the end of every line
        return re.sub(self.COMMENTS, '', src, flags=re.M)

    def _strip_lines(self, src: str) -> str:
        """
        Strips whitespaces at the end of lines
        """
        # Spaces and tabs only: `\s` would also consume newlines and
        # thus interfere with the line structure of the text.
        return re.sub(r'[ \t]+$', '', src, flags=re.M)