Markov/src/rulesparser.py

from rule import Rule, EMPTY_SYMBOL
import re

class RulesParser:
    """
    Parser for rule files.

    Every non-empty line is expected to look like
    ``<operand> <arrow> <target>`` with the three parts separated by
    whitespace. The arrow is matched against TRANSFORM (ordinary rule)
    or B_TRANSFORM (blocking rule). ``//`` starts a comment that runs
    to the end of the line.
    """
    # Regex for the ordinary (non-blocking) arrow
    TRANSFORM = r'->'
    # Regex for the blocking arrow
    B_TRANSFORM = r'->\|'
    # Separator between rules
    NEWLINE = '\n'
    # Regex for the separator between the parts of one rule
    IGNORE = r'\s+'
    # Regex for a comment; `.*` (not `.+`) so a bare `//` is removed too
    COMMENTS = r'//.*$'
    # Regex matching one literal occurrence of the empty symbol
    EMPTY = re.escape(EMPTY_SYMBOL)

    def parse(self, file: str) -> list[Rule]:
        """
        Parsing file according to syntax, specified
        in class variables. Hardcoded for now (and forever)

        :param file: path of the file with rules
        :return: rules in the order they appear in the file
        :raises ValueError: if some line is not a well-formed rule
        """
        with open(file, 'r') as f:
            text = f.read()
        return [self._parse_rule(line) for line in self._get_lines(text)]

    def _parse_rule(self, line: str) -> Rule:
        """
        tries to parse rule according to set grammar

        :param line: one line with comments already removed
        :return: Rule built from the operand, the target and the arrow kind
        :raises ValueError: if the line has fewer than 3 parts or
                            the arrow is not recognized
        """
        # strip first: leading whitespace would otherwise produce an empty
        # first token and shift operand/arrow/target by one position
        tokens = re.split(self.IGNORE, line.strip())
        # we always expect 3 parts: operand, arrow, target
        # (anything after the target is ignored)
        if len(tokens) < 3:
            raise ValueError(f"Can't parse rule \"{line}\": "
                             f"operand, transform symbol and target, "
                             f"separated by whitespace, expected")
        operand, arrow, target = tokens[:3]

        if re.fullmatch(self.TRANSFORM, arrow):
            is_blocking = False
        elif re.fullmatch(self.B_TRANSFORM, arrow):
            is_blocking = True
        else:
            raise ValueError(f"Can't recognize transform symbol. "
                             f"\"{self.TRANSFORM}\" or \"{self.B_TRANSFORM}\""
                             f" expected, but \"{arrow}\" encountered")

        #optimising empty symbol
        return Rule(
                operand=self._optimise_empty(operand),
                target=self._optimise_empty(target),
                is_blocking=is_blocking
                )

    def _optimise_empty(self, string: str) -> str:
        """
        Empty symbol has meaning only while it's the only
        symbol in the string, so all other empty symbols
        can be optimised away

        Returns a single EMPTY symbol if the string consists only of
        EMPTY symbols, otherwise returns the string with every EMPTY
        symbol deleted (unchanged if there were none)
        """
        # the group makes `+` repeat the whole symbol, not only its last
        # character (matters when EMPTY_SYMBOL is longer than one char)
        if re.fullmatch(f'(?:{self.EMPTY})+', string):
            return EMPTY_SYMBOL
        return string.replace(EMPTY_SYMBOL, '')

    def _get_lines(self, src: str) -> list[str]:
        """
        Get cleaned lines only with rules to parse

        Comments and trailing whitespace are removed; lines that are
        empty or contain only whitespace are dropped
        """
        text = self._remove_comments(src)
        text = self._strip_lines(text)
        # `line.strip()` also rejects lines made of tabs/spaces only,
        # which are not rules and would fail to parse
        return [line for line in text.split(self.NEWLINE) if line.strip()]

    def _remove_comments(self, src: str) -> str:
        """
        removes comments from end of lines and returns
        cleaned text
        """
        # re.M makes `$` match at the end of every line, not only of the text
        return re.sub(self.COMMENTS, '', src, flags=re.M)

    def _strip_lines(self, src: str) -> str:
        """
        Strips whitespaces at the end of lines
        """
        # `[^\S\n]` is any whitespace except the newline itself, so tabs
        # and stray `\r` go away while the line structure is preserved
        return re.sub(r'[^\S\n]+$', '', src, flags=re.M)