# Source code for EduNLP.SIF.sif

# coding: utf-8
# 2021/5/16 @ tongshiwei

import traceback
import warnings
from .segment import seg
from .tokenization import tokenize, link_formulas
from .parser import Parser

__all__ = ["is_sif", "to_sif", "sif4sci"]


def is_sif(item, check_formula=True, return_parser=False):
    r"""
    Check whether the input item is already in SIF format.

    Parameters
    ----------
    item: str
        a raw item which respects stem
    check_formula: bool
        whether to check the formulas when parsing item.

        True if check the validity of formulas in item
        False if not check the validity of formulas in item, which is faster
    return_parser: bool
        whether to put the parsed item in return.

        when True, the format of return is (bool, Parser)
        when False, the format of return is bool

    Returns
    -------
    bool or (bool, Parser)
        True when the item is in standardized format originally, i.e. the parser
        reports neither an error nor a modification (both flags equal 0);
        False otherwise.
        The Parser of the item is appended only when ``return_parser is True``.

    Raises
    ------
    ValueError
        when the parser flags a formula in the item as illegal; the message is
        the one provided by the parser.

    Examples
    --------
    >>> text = '若$x,y$满足约束条件' \
    ...        '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$，' \
    ...        '则$z=x+7 y$的最大值$\\SIFUnderline$'
    >>> is_sif(text)
    True
    >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x（单位...'
    >>> ret = is_sif(text, return_parser=True)
    >>> ret # doctest: +ELLIPSIS
    (False, <EduNLP.SIF.parser.parser.Parser object...>)
    """
    item_parser = Parser(item, check_formula)
    # The flags inspected below are read only after this call has run.
    item_parser.description_list()

    # NOTE: "fomula" (sic) is the attribute spelling used by Parser.
    if item_parser.fomula_illegal_flag:
        raise ValueError(item_parser.fomula_illegal_message)

    # SIF-conformant only if the parser neither hit an error nor had to modify the text.
    ret = item_parser.error_flag == 0 and item_parser.modify_flag == 0

    if return_parser is True:
        return ret, item_parser
    return ret


def to_sif(item, check_formula=True, parser: Parser = None):
    r"""
    Convert an item to SIF format.

    Parameters
    ----------
    item: str
        a raw item which respects stem (ignored when ``parser`` is given)
    check_formula: bool
        whether to check the formulas when parsing item (only work when parser=None).
    parser: Parser
        the parser of item returned from is_sif. When given, its ``text`` is
        returned directly and the item is not parsed again.

    Returns
    -------
    item: str
        the item which accords with sif format

    Raises
    ------
    ValueError
        propagated from is_sif (only possible when parser=None) if a formula
        in the item is flagged as illegal.

    Examples
    --------
    >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x（单位...'
    >>> siftext = to_sif(text)
    >>> siftext
    '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$（单位...'
    >>> ret = is_sif(text, return_parser=True)
    >>> ret # doctest: +ELLIPSIS
    (False, <EduNLP.SIF.parser.parser.Parser object...>)
    >>> to_sif(text, parser=ret[1])
    '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$（单位...'

    """
    if parser is None:
        # No parser supplied: parse the item here and keep only the Parser half of the result.
        _, parser = is_sif(item, check_formula, return_parser=True)
    return parser.text


def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str = None, tokenization=True,
            tokenization_params=None, errors="raise"):
    r"""
    Segment an item (converting it to SIF format first when requested) and optionally tokenize it.

    Default to use linear Tokenizer, change the tokenizer by specifying tokenization_params

    Parameters
    ----------
    item: str
        a raw item which respects stem
    figures: dict or bool
        when it is a dict, it means the id-to-instance for figures in 'FormFigureID{...}' format,
        when it is a bool, it means whether to instantiate figures in 'FormFigureBase64{...}' format

    mode: int
        when mode = 2, use is_sif and check formula in item
        when mode = 1, use is_sif but don't check formula in item
        when mode = 0, don't use is_sif and don't check anything in item

    symbol: str
        select the methods to symbolize:
            "t": text
            "f": formula
            "g": figure
            "m": question mark
            "a": tag
            "s": sep

    tokenization: bool
        whether to tokenize item after segmentation

    tokenization_params:
        the dict of text_params, formula_params and figure_params in tokenization
        For formula_params:
            method: which tokenizer to be used, "linear" or "ast"
            The parameters only useful for "linear":
                skip_figure_formula: whether to skip the formula in figure format
                symbolize_figure_formula: whether to symbolize the formula in figure format
            The parameters only useful for "ast":
                ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens.
                var_numbering: whether to use number suffix to denote different variables
                return_type: 'list' or 'ast'
            More parameters can be found in the definition in SIF.tokenization.formula
        For figure_params:
            figure_instance: whether to return instance of figures in tokens
        For text_params: See definition in SIF.tokenization.text
            granularity: word or char
            stopwords: default or None or list

    errors: str
        what to do when any step raises an exception:
            "raise": re-raise the exception (default)
            "warn": emit the formatted traceback through warnings.warn and return None
        any other value (e.g. "coerce", "strict", "ignore") currently suppresses
        the exception silently and returns None

    Returns
    -------
    list
        When tokenization is False, return SegmentList;
        When tokenization is True, return TokenList

    Raises
    ------
    KeyError
        when mode is not 0, 1 or 2 (only propagated when errors="raise")
    Exception
        any exception raised while checking, segmenting or tokenizing the item,
        e.g. the ValueError from is_sif for an illegal formula
        (only propagated when errors="raise")

    Examples
    --------
    >>> test_item = r"如图所示，则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$"
    >>> tl = sif4sci(test_item)
    >>> tl
    ['如图所示', '\\bigtriangleup', 'ABC', '面积', '\\SIFBlank', \FigureID{1}]
    >>> tl.describe()
    {'t': 2, 'f': 2, 'g': 1, 'm': 1}
    >>> with tl.filter('fgm'):
    ...     tl
    ['如图所示', '面积']
    >>> with tl.filter(keep='t'):
    ...     tl
    ['如图所示', '面积']
    >>> with tl.filter():
    ...     tl
    ['如图所示', '\\bigtriangleup', 'ABC', '面积', '\\SIFBlank', \FigureID{1}]
    >>> tl.text_tokens
    ['如图所示', '面积']
    >>> tl.formula_tokens
    ['\\bigtriangleup', 'ABC']
    >>> tl.figure_tokens
    [\FigureID{1}]
    >>> tl.ques_mark_tokens
    ['\\SIFBlank']
    >>> sif4sci(test_item, symbol="gm", tokenization_params={"formula_params": {"method": "ast"}})
    ['如图所示', <Formula: \bigtriangleup ABC>, '面积', '[MARK]', '[FIGURE]']
    >>> sif4sci(test_item, symbol="tfgm")
    ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']
    >>> sif4sci(test_item, symbol="gm",
    ... tokenization_params={"formula_params": {"method": "ast", "return_type": "list"}})
    ['如图所示', '\\bigtriangleup', 'A', 'B', 'C', '面积', '[MARK]', '[FIGURE]']
    >>> test_item_1 = {
    ...     "stem": r"若$x=2$, $y=\sqrt{x}$，则下列说法正确的是$\SIFChoice$",
    ...     "options": [r"$x < y$", r"$y = x$", r"$y < x$"]
    ... }
    >>> tls = [
    ...     sif4sci(e, symbol="gm",
    ...     tokenization_params={
    ...         "formula_params": {
    ...             "method": "ast", "return_type": "list", "ord2token": True, "var_numbering": True,
    ...             "link_variable": False}
    ...     })
    ...     for e in ([test_item_1["stem"]] + test_item_1["options"])
    ... ]
    >>> tls[1:]
    [['mathord_0', '<', 'mathord_1'], ['mathord_0', '=', 'mathord_1'], ['mathord_0', '<', 'mathord_1']]
    >>> link_formulas(*tls)
    >>> tls[1:]
    [['mathord_0', '<', 'mathord_1'], ['mathord_1', '=', 'mathord_0'], ['mathord_1', '<', 'mathord_0']]
    >>> from EduNLP.utils import dict2str4sif
    >>> test_item_1_str = dict2str4sif(test_item_1, tag_mode="head", add_list_no_tag=False)
    >>> test_item_1_str  # doctest: +ELLIPSIS
    '$\\SIFTag{stem}$...则下列说法正确的是$\\SIFChoice$$\\SIFTag{options}$$x < y$$\\SIFSep$$y = x$$\\SIFSep$$y < x$'
    >>> tl1 = sif4sci(test_item_1_str, symbol="gm",
    ... tokenization_params={"formula_params": {"method": "ast", "return_type": "list", "ord2token": True}})
    >>> tl1.get_segments()[0]
    ['\\SIFTag{stem}']
    >>> tl1.get_segments()[1:3]
    [['[TEXT_BEGIN]', '[TEXT_END]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'textord', '[FORMULA_END]']]
    >>> tl1.get_segments(add_seg_type=False)[0:3]
    [['\\SIFTag{stem}'], ['mathord', '=', 'textord'], ['mathord', '=', 'mathord', '{ }', '\\sqrt']]
    >>> test_item_2 = {"options": [r"$x < y$", r"$y = x$", r"$y < x$"]}
    >>> test_item_2
    {'options': ['$x < y$', '$y = x$', '$y < x$']}
    >>> test_item_2_str = dict2str4sif(test_item_2, tag_mode="head", add_list_no_tag=False)
    >>> test_item_2_str
    '$\\SIFTag{options}$$x < y$$\\SIFSep$$y = x$$\\SIFSep$$y < x$'
    >>> tl2 = sif4sci(test_item_2_str, symbol="gms",
    ... tokenization_params={"formula_params": {"method": "ast", "return_type": "list"}})
    >>> tl2
    ['\\SIFTag{options}', 'x', '<', 'y', '[SEP]', 'y', '=', 'x', '[SEP]', 'y', '<', 'x']
    >>> tl2.get_segments(add_seg_type=False)
    [['\\SIFTag{options}'], ['x', '<', 'y'], ['[SEP]'], ['y', '=', 'x'], ['[SEP]'], ['y', '<', 'x']]
    >>> tl2.get_segments(add_seg_type=False, drop="s")
    [['\\SIFTag{options}'], ['x', '<', 'y'], ['y', '=', 'x'], ['y', '<', 'x']]
    >>> tl3 = sif4sci(test_item_1["stem"], symbol="gs")
    >>> tl3.text_segments
    [['说法', '正确']]
    >>> tl3.formula_segments
    [['x', '=', '2'], ['y', '=', '\\sqrt', '{', 'x', '}']]
    >>> tl3.figure_segments
    []
    >>> tl3.ques_mark_segments
    [['\\SIFChoice']]
    >>> test_item_3 = r"已知$y=x$，则以下说法中$\textf{正确,b}$的是"
    >>> tl4 = sif4sci(test_item_3)
    Warning: there is some chinese characters in formula!
    >>> tl4.text_segments
    [['已知'], ['说法', '中', '正确']]
    """
    try:
        if mode in (1, 2):
            # Only mode 2 validates formulas; mode 1 runs is_sif without that (slower) check.
            sif, item_parser = is_sif(item, check_formula=(mode == 2), return_parser=True)
            if sif is not True:
                # Reuse the parser from the check so the item is not parsed a second time.
                item = to_sif(item, parser=item_parser)
        elif mode != 0:
            raise KeyError(
                "Unknown mode %s, use only 0 or 1 or 2." % mode
            )

        ret = seg(item, figures, symbol)

        if tokenization is True:
            ret = tokenize(ret, **(tokenization_params if tokenization_params is not None else {}))

        return ret
    except Exception:  # pragma: no cover
        if errors == "warn":
            warnings.warn(traceback.format_exc())
        elif errors == "raise":
            raise
        # Any other ``errors`` value swallows the exception; the function then returns None.