# Source code for EduNLP.SIF.parser.parser

from EduNLP.Formula.ast import str2ast, katex_parse
import re


class Parser:
    """
    Parse an item (a question text) into the standard SIF format.

    The parser walks ``self.text`` from left to right with a cursor
    (``self.head``).  While tokenizing it rewrites the text in place:

    * a run of bare letters or digits that is delimited by Chinese characters
      or punctuation is wrapped in ``$...$``;
    * an empty pair of brackets (``( )`` or ``（ ）``) becomes ``$\\SIFChoice$``;
    * a run of underscores becomes ``$\\SIFBlank$``.

    After calling :meth:`description_list`, inspect ``text`` for the rewritten
    item, ``error_flag`` for a syntax error and ``fomula_illegal_flag`` for a
    formula rejected by ``katex_parse``.

    Parameters
    ----------
    data : str
        The raw item text.
    check_formula : bool
        When True, every ``$...$`` span is validated with ``katex_parse``.
    """

    def __init__(self, data, check_formula=True):
        # NOTE: several attribute names below are misspelled ("postion",
        # "warnning", "fomula").  They are part of the public interface and
        # are kept as-is for backward compatibility.
        self.lookahead = 0  # most recently read token id
        self.head = 0  # cursor: index of the next unread character
        self.text = data
        self.error_message = ''
        self.error_postion = 0
        self.error_flag = 0
        self.modify_flag = 0  # set to 1 once the text has been rewritten
        self.warnning = 0  # set to 1 when a formula contains Chinese
        self.fomula_illegal_flag = 0
        self.fomula_illegal_message = ''
        self.check_formula = check_formula

        # Lengths of the placeholders inserted into the text.
        self.len_bracket = len('$\\SIFChoice$')
        self.len_underline = len('$\\SIFBlank$')

        # Token ids.
        self.error = -1
        self.character = 1
        self.en_pun = 2
        self.ch_pun = 3
        self.latex = 4
        self.end = 5
        self.empty = 6
        self.modify = 7
        self.blank = 8

        self.en_pun_list = [',', '.', '?', '!',
                            ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '\\', '<', '>', '[', ']',
                            '-']  # add some other chars
        self.ch_pun_list = ['，', '。', '！', '？', '：',
                            '；', '‘', '’', '“', '”', '（', '）', ' ', '、', '《', '》', '—', '．']
        # Characters allowed inside a bare letter/digit run.
        self.in_list = [',', '_', '-', '%']
        # Characters that may delimit a bare letter/digit run.
        self.flag_list = ['，', '。', '！', '？', '：',
                          '；', '‘', '’', '“', '”', '（', '）', ' ', '、', '《', '》',
                          '$', ',', '.', '?', '!', ':', ';', '\'', '\"', '(', ')', ' ', '_', '/', '|', '<', '>', '-',
                          '[', ']', '—']

    def is_number(self, uchar):
        """Return True if ``uchar`` is an ASCII digit (U+0030..U+0039)."""
        return u'\u0030' <= uchar <= u'\u0039'

    def is_alphabet(self, uchar):
        """Return True if ``uchar`` is an ASCII letter (A-Z or a-z)."""
        return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')

    def is_chinese(self, uchar):
        """Return True if ``uchar`` lies in the range U+4E00..U+9FA5."""
        return u'\u4e00' <= uchar <= u'\u9fa5'

    def _is_formula_legal(self, formula_str):
        r"""
        Judge whether the current formula meets our specification or not.

        A formula containing one of the SIF/figure tags is accepted without
        further checks; anything else must be parseable by ``katex_parse``.
        On a parse failure ``fomula_illegal_flag`` is set and the message is
        stored in ``fomula_illegal_message``.

        Parameters
        ----------
        formula_str : str
            The formula body, without the surrounding ``$``.

        Returns
        -------
        bool
            True if the formula is accepted, False otherwise.
        """
        legal_tags = ['FormFigureID', 'FormFigureBase64', 'FigureID', 'FigureBase64',
                      'SIFBlank', 'SIFChoice', 'SIFTag', 'SIFSep', 'SIFUnderline', 'textf']
        if any(tag in formula_str for tag in legal_tags):
            return True
        try:
            katex_parse(formula_str)
        except Exception as e:
            # Only parse errors are expected here; anything else trips the
            # assertion (kept unchanged so callers see the same exception).
            assert 'ParseError' in str(e)
            self.fomula_illegal_message = "[FormulaError] " + str(e)
            self.fomula_illegal_flag = 1
            return False
        return True

    def call_error(self):
        """Record a syntax error at the current cursor position."""
        self.error_postion = self.head
        self.error_message = self.text[:self.head + 1]
        self.error_flag = 1

    def _wrap_inline_run(self, in_run):
        """
        Wrap a bare run of letters or digits starting at the cursor in ``$``.

        The run is wrapped only when it is delimited on both sides: it starts
        the text or follows a Chinese character / ``flag_list`` character, and
        it ends the text or is followed by one.  Any other run is treated as a
        formula that was not entered in LaTeX syntax and is reported as an
        error.  (Previously this case returned nothing and left the cursor
        untouched, which made ``txt_list`` recurse forever.)

        Parameters
        ----------
        in_run : callable
            ``self.is_alphabet`` or ``self.is_number``.

        Returns
        -------
        int
            ``self.modify`` when the text was rewritten, else ``self.error``.
        """
        text = self.text
        left = self.head
        if left > 0:
            forward = text[left - 1]
            delimited = self.is_chinese(forward) or forward in self.flag_list
        else:
            delimited = True

        if delimited:
            right = left
            while right < len(text) and (in_run(text[right]) or text[right] in self.in_list):
                right += 1
            if right == len(text) or self.is_chinese(text[right]) or text[right] in self.flag_list:
                self.text = text[:left] + "$" + text[left:right] + "$" + text[right:]
                # Two "$" were inserted, so the unread remainder moved by 2.
                self.head = right + 2
                self.modify_flag = 1
                return self.modify

        self.call_error()
        return self.error

    def _replace_empty_brackets(self, left, closer):
        """
        Replace an empty bracket pair opened at ``left`` with ``$\\SIFChoice$``.

        The cursor must already be just past the opening bracket.  Blanks
        (space or ``\\xa0``) after the opening bracket are skipped even when
        no replacement happens, as before.

        Returns
        -------
        bool
            True if the pair was replaced, False otherwise.
        """
        text = self.text
        pos = self.head
        # Bounds-checked: the text may end right after the opening bracket.
        while pos < len(text) and text[pos] in (' ', '\xa0'):
            pos += 1
        self.head = pos
        if pos < len(text) and text[pos] == closer:
            self.text = text[:left] + '$\\SIFChoice$' + text[pos + 1:]
            # Resume right after the placeholder so the following characters
            # are still tokenized.
            self.head = left + self.len_bracket
            self.modify_flag = 1
            return True
        return False

    def _replace_underline(self, left):
        """
        Replace the run of underscores/spaces starting at ``left`` with
        ``$\\SIFBlank$``.  The cursor must already be just past ``left``.
        """
        text = self.text
        pos = self.head
        while pos < len(text) and text[pos] in ('_', ' '):
            pos += 1
        self.text = text[:left] + '$\\SIFBlank$' + text[pos:]
        # Resume right after the placeholder.
        self.head = left + self.len_underline
        self.modify_flag = 1

    def _scan_formula(self):
        """
        Consume a ``$...$`` formula whose opening ``$`` is at the cursor.

        Returns
        -------
        int
            ``self.latex`` on success; ``self.error`` when the closing ``$``
            is missing or the formula fails the legality check.
        """
        self.head += 1
        formula_start = self.head
        warned = False
        while self.head < len(self.text) and self.text[self.head] != '$':
            if not warned and self.is_chinese(self.text[self.head]):
                # Chinese inside a formula: warn once per formula.
                print("Warning: there is some chinese characters in formula!")
                self.warnning = 1
                warned = True
            self.head += 1
        if self.head >= len(self.text):
            # Unterminated formula.
            self.call_error()
            return self.error

        # Check that the formula is complete and parseable.
        if self.check_formula and not self._is_formula_legal(self.text[formula_start:self.head]):
            self.call_error()
            return self.error
        self.head += 1
        return self.latex

    def get_token(self):
        r"""
        Read the next token at the cursor, rewriting ``self.text`` if needed.

        Returns
        -------
        int
            One of the token ids defined in ``__init__``: ``character``
            (Chinese character), ``ch_pun`` / ``en_pun`` (punctuation),
            ``latex`` (a ``$...$`` formula), ``end`` (newline), ``modify``
            (the text was rewritten), ``empty`` (end of text) or ``error``.
        """
        if self.head >= len(self.text):
            return self.empty
        ch = self.text[self.head]

        if self.is_chinese(ch):
            self.head += 1
            return self.character
        if self.is_alphabet(ch):
            # Letters outside a formula.
            return self._wrap_inline_run(self.is_alphabet)
        if self.is_number(ch):
            # Digits outside a formula.
            return self._wrap_inline_run(self.is_number)
        if ch == '\n':
            self.head += 1
            return self.end
        if ch in self.ch_pun_list:
            left = self.head
            self.head += 1
            if ch == '（' and self._replace_empty_brackets(left, '）'):
                return self.modify
            return self.ch_pun
        if ch in self.en_pun_list:
            left = self.head
            self.head += 1
            if ch == '(' and self._replace_empty_brackets(left, ')'):
                return self.modify
            if ch == '_':
                self._replace_underline(left)
                return self.modify
            return self.en_pun
        if ch == '$':
            return self._scan_formula()

        # Any other character is not part of the grammar.
        self.call_error()
        return self.error

    def next_token(self):
        """Read the next token into ``self.lookahead``."""
        self.lookahead = self.get_token()

    def match(self, terminal):
        """Advance to the next token if the lookahead equals ``terminal``."""
        if self.lookahead == terminal:
            self.next_token()

    def txt(self):
        """
        Read one token; if it is a plain text token (character, punctuation
        or formula), accept it and read the following token as well.
        """
        self.lookahead = self.get_token()
        if self.error_flag:
            return
        if self.lookahead in (self.character, self.en_pun, self.ch_pun, self.latex):
            self.match(self.lookahead)

    def txt_list(self):
        """
        Consume tokens until the end of the text or the first error.

        Implemented as a loop rather than recursion so that long items do not
        exceed the interpreter's recursion limit.
        """
        while True:
            self.txt()
            if self.error_flag or self.lookahead == self.empty:
                return

    def description(self):
        """Parse one description: a token list followed by the end of text."""
        self.txt_list()
        if self.error_flag:
            return
        if self.lookahead == self.empty:
            self.match(self.lookahead)

    def description_list(self):
        r"""
        Entry point: parse the whole item and rewrite ``self.text`` in place.

        Returns nothing; read ``text``, ``error_flag``, ``modify_flag`` and
        ``fomula_illegal_flag`` afterwards.

        Examples
        --------
        >>> text = '生产某种零件的A工厂25名工人的日加工零件数_   _'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.text
        '生产某种零件的$A$工厂$25$名工人的日加工零件数$\\SIFBlank$'
        >>> text = 'X的分布列为(   )'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.text
        '$X$的分布列为$\\SIFChoice$'
        >>> text = '① AB是⊙O的直径，AC是⊙O的切线，BC交⊙O于点E．AC的中点为D'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.error_flag
        1
        >>> text = '支持公式如$\\frac{y}{x}$，$\\SIFBlank$，$\\FigureID{1}$，不支持公式如$\\frac{ \\dddot y}{x}$'
        >>> text_parser = Parser(text)
        >>> text_parser.description_list()
        >>> text_parser.fomula_illegal_flag
        1
        """
        self.description()
        if self.error_flag:
            return
        if self.lookahead != self.empty:
            self.description_list()  # pragma: no cover
        else:
            self.error_flag = 0