# coding: utf-8
# 2021/5/16 @ tongshiwei
import traceback
import warnings
from .segment import seg
from .tokenization import tokenize, link_formulas
from .parser import Parser
__all__ = ["is_sif", "to_sif", "sif4sci"]
def is_sif(item, check_formula=True, return_parser=False):
    r"""
    Check whether the input item already conforms to the SIF format.

    Parameters
    ----------
    item: str
        a raw item which respects stem
    check_formula: bool
        whether to check the formulas when parsing item (forwarded to ``Parser``).
        True if check the validity of formulas in item
        False if not check the validity of formulas in item, which is faster
    return_parser: bool
        whether to put the parsed item in return.
        when True, the format of return is (bool, Parser)
        when False, the format of return is bool

    Returns
    -------
    bool or (bool, Parser)
        True when the item is in standardized format originally, i.e. the parser
        reported neither an error nor a modification; False otherwise.
        The ``Parser`` is appended only when ``return_parser`` is True.

    Raises
    ------
    ValueError
        when the parser flags an illegal formula, i.e. the item can not be parsed correctly.

    Examples
    --------
    >>> text = '若$x,y$满足约束条件' \
    ... '$\\left\\{\\begin{array}{c}2 x+y-2 \\leq 0 \\\\ x-y-1 \\geq 0 \\\\ y+1 \\geq 0\\end{array}\\right.$,' \
    ... '则$z=x+7 y$的最大值$\\SIFUnderline$'
    >>> is_sif(text)
    True
    >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'
    >>> ret = is_sif(text, return_parser=True)
    >>> ret # doctest: +ELLIPSIS
    (False, <EduNLP.SIF.parser.parser.Parser object...>)
    """
    parsed = Parser(item, check_formula)
    parsed.description_list()

    # An illegal formula is fatal: surface the parser's own message.
    if parsed.fomula_illegal_flag:
        raise ValueError(parsed.fomula_illegal_message)

    # The item counts as SIF only when parsing reported neither an error nor a modification.
    conforms = bool(parsed.error_flag == 0 and parsed.modify_flag == 0)

    if return_parser is not True:
        return conforms
    return conforms, parsed
def to_sif(item, check_formula=True, parser: Parser = None):
    r"""
    Convert an item to the SIF format.

    Parameters
    ----------
    item: str
        a raw item which respects stem (ignored when ``parser`` is given)
    check_formula: bool
        whether to check the formulas when parsing item (only work when parser=None).
    parser: Parser
        the parser of item returned from is_sif. When given, its ``text`` is
        returned directly and the item is not parsed again.

    Returns
    -------
    item: str
        the item which accords with sif format

    Raises
    ------
    ValueError
        propagated from ``is_sif`` when ``parser`` is None and the item can not be parsed.

    Examples
    --------
    >>> text = '某校一个课外学习小组为研究某作物的发芽率y和温度x(单位...'
    >>> siftext = to_sif(text)
    >>> siftext
    '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'
    >>> ret = is_sif(text, return_parser=True)
    >>> ret # doctest: +ELLIPSIS
    (False, <EduNLP.SIF.parser.parser.Parser object...>)
    >>> to_sif(text, parser=ret[1])
    '某校一个课外学习小组为研究某作物的发芽率$y$和温度$x$(单位...'
    """
    # Reuse an already-built parser so the item is not parsed a second time.
    if parser is not None:
        return parser.text
    # is_sif(..., return_parser=True) returns (bool, Parser); only the parser's text is needed.
    return is_sif(item, check_formula, return_parser=True)[1].text
def sif4sci(item: str, figures: (dict, bool) = None, mode: int = 2, symbol: str = None, tokenization=True,
            tokenization_params=None, errors="raise"):
    r"""
    Segment (and optionally tokenize) an item, converting it to SIF first when required.

    Default to use linear Tokenizer, change the tokenizer by specifying tokenization_params

    Parameters
    ----------
    item: str
        a raw item which respects stem
    figures: dict or bool
        when it is a dict, it means the id-to-instance for figures in 'FormFigureID{...}' format,
        when it is a bool, it means whether to instantiate figures in 'FormFigureBase64{...}' format
    mode: int
        when mode = 2, use is_sif and check formula in item
        when mode = 1, use is_sif but don't check formula in item
        when mode = 0, don't use is_sif and don't check anything in item
        any other value is treated as an error (KeyError)
    symbol: str
        select the methods to symbolize:
            "t": text
            "f": formula
            "g": figure
            "m": question mark
            "a": tag
            "s": sep
    tokenization: bool
        whether to tokenize item after segmentation
    tokenization_params: dict
        the dict of text_params, formula_params and figure_params in tokenization

        For formula_params:
            method: which tokenizer to be used, "linear" or "ast"
            The parameters only useful for "linear":
                skip_figure_formula: whether to skip the formula in figure format
                symbolize_figure_formula: whether to symbolize the formula in figure format
            The parameters only useful for "ast":
                ord2token: whether to transfer the variables (mathord) and constants (textord) to special tokens.
                var_numbering: whether to use number suffix to denote different variables
                return_type: 'list' or 'ast'
            More parameters can be found in the definition in SIF.tokenization.formula
        For figure_params:
            figure_instance: whether to return instance of figures in tokens
        For text_params: See definition in SIF.tokenization.text
            granularity: word or char
            stopwords: default or None or list
    errors: str
        how an exception raised during processing is handled:
            "raise": re-raise the exception
            "warn": emit the formatted traceback as a warning and return None
            any other value (e.g. "coerce", "strict", "ignore"): the exception is
            swallowed silently and None is returned

    Returns
    -------
    list
        When tokenization is False, return SegmentList;
        When tokenization is True, return TokenList

    Raises
    ------
    KeyError
        when ``mode`` is not 0, 1 or 2 (only with ``errors="raise"``)
    Exception
        any exception raised while checking, segmenting or tokenizing (only with ``errors="raise"``)

    Examples
    --------
    >>> test_item = r"如图所示,则$\bigtriangleup ABC$的面积是$\SIFBlank$。$\FigureID{1}$"
    >>> tl = sif4sci(test_item)
    >>> tl
    ['如图所示', '\\bigtriangleup', 'ABC', '面积', '\\SIFBlank', \FigureID{1}]
    >>> tl.describe()
    {'t': 2, 'f': 2, 'g': 1, 'm': 1}
    >>> with tl.filter('fgm'):
    ...     tl
    ['如图所示', '面积']
    >>> with tl.filter(keep='t'):
    ...     tl
    ['如图所示', '面积']
    >>> with tl.filter():
    ...     tl
    ['如图所示', '\\bigtriangleup', 'ABC', '面积', '\\SIFBlank', \FigureID{1}]
    >>> tl.text_tokens
    ['如图所示', '面积']
    >>> tl.formula_tokens
    ['\\bigtriangleup', 'ABC']
    >>> tl.figure_tokens
    [\FigureID{1}]
    >>> tl.ques_mark_tokens
    ['\\SIFBlank']
    >>> sif4sci(test_item, symbol="gm", tokenization_params={"formula_params": {"method": "ast"}})
    ['如图所示', <Formula: \bigtriangleup ABC>, '面积', '[MARK]', '[FIGURE]']
    >>> sif4sci(test_item, symbol="tfgm")
    ['[TEXT]', '[FORMULA]', '[TEXT]', '[MARK]', '[TEXT]', '[FIGURE]']
    >>> sif4sci(test_item, symbol="gm",
    ... tokenization_params={"formula_params": {"method": "ast", "return_type": "list"}})
    ['如图所示', '\\bigtriangleup', 'A', 'B', 'C', '面积', '[MARK]', '[FIGURE]']
    >>> test_item_1 = {
    ...     "stem": r"若$x=2$, $y=\sqrt{x}$,则下列说法正确的是$\SIFChoice$",
    ...     "options": [r"$x < y$", r"$y = x$", r"$y < x$"]
    ... }
    >>> tls = [
    ...     sif4sci(e, symbol="gm",
    ...     tokenization_params={
    ...         "formula_params": {
    ...             "method": "ast", "return_type": "list", "ord2token": True, "var_numbering": True,
    ...             "link_variable": False}
    ...     })
    ...     for e in ([test_item_1["stem"]] + test_item_1["options"])
    ... ]
    >>> tls[1:]
    [['mathord_0', '<', 'mathord_1'], ['mathord_0', '=', 'mathord_1'], ['mathord_0', '<', 'mathord_1']]
    >>> link_formulas(*tls)
    >>> tls[1:]
    [['mathord_0', '<', 'mathord_1'], ['mathord_1', '=', 'mathord_0'], ['mathord_1', '<', 'mathord_0']]
    >>> from EduNLP.utils import dict2str4sif
    >>> test_item_1_str = dict2str4sif(test_item_1, tag_mode="head", add_list_no_tag=False)
    >>> test_item_1_str # doctest: +ELLIPSIS
    '$\\SIFTag{stem}$...则下列说法正确的是$\\SIFChoice$$\\SIFTag{options}$$x < y$$\\SIFSep$$y = x$$\\SIFSep$$y < x$'
    >>> tl1 = sif4sci(test_item_1_str, symbol="gm",
    ... tokenization_params={"formula_params": {"method": "ast", "return_type": "list", "ord2token": True}})
    >>> tl1.get_segments()[0]
    ['\\SIFTag{stem}']
    >>> tl1.get_segments()[1:3]
    [['[TEXT_BEGIN]', '[TEXT_END]'], ['[FORMULA_BEGIN]', 'mathord', '=', 'textord', '[FORMULA_END]']]
    >>> tl1.get_segments(add_seg_type=False)[0:3]
    [['\\SIFTag{stem}'], ['mathord', '=', 'textord'], ['mathord', '=', 'mathord', '{ }', '\\sqrt']]
    >>> test_item_2 = {"options": [r"$x < y$", r"$y = x$", r"$y < x$"]}
    >>> test_item_2
    {'options': ['$x < y$', '$y = x$', '$y < x$']}
    >>> test_item_2_str = dict2str4sif(test_item_2, tag_mode="head", add_list_no_tag=False)
    >>> test_item_2_str
    '$\\SIFTag{options}$$x < y$$\\SIFSep$$y = x$$\\SIFSep$$y < x$'
    >>> tl2 = sif4sci(test_item_2_str, symbol="gms",
    ... tokenization_params={"formula_params": {"method": "ast", "return_type": "list"}})
    >>> tl2
    ['\\SIFTag{options}', 'x', '<', 'y', '[SEP]', 'y', '=', 'x', '[SEP]', 'y', '<', 'x']
    >>> tl2.get_segments(add_seg_type=False)
    [['\\SIFTag{options}'], ['x', '<', 'y'], ['[SEP]'], ['y', '=', 'x'], ['[SEP]'], ['y', '<', 'x']]
    >>> tl2.get_segments(add_seg_type=False, drop="s")
    [['\\SIFTag{options}'], ['x', '<', 'y'], ['y', '=', 'x'], ['y', '<', 'x']]
    >>> tl3 = sif4sci(test_item_1["stem"], symbol="gs")
    >>> tl3.text_segments
    [['说法', '正确']]
    >>> tl3.formula_segments
    [['x', '=', '2'], ['y', '=', '\\sqrt', '{', 'x', '}']]
    >>> tl3.figure_segments
    []
    >>> tl3.ques_mark_segments
    [['\\SIFChoice']]
    >>> test_item_3 = r"已知$y=x$,则以下说法中$\textf{正确,b}$的是"
    >>> tl4 = sif4sci(test_item_3)
    Warning: there is some chinese characters in formula!
    >>> tl4.text_segments
    [['已知'], ['说法', '中', '正确']]
    """
    try:
        if mode in [1, 2]:
            # Only the strictest mode (2) validates formulas; mode 1 runs the
            # SIF check with formula validation switched off.
            check_formula = mode == 2
            sif, item_parser = is_sif(item, check_formula=check_formula, return_parser=True)
            if sif is not True:
                # Reuse the parser built by is_sif instead of parsing the item again.
                item = to_sif(item, parser=item_parser)
        elif mode != 0:
            raise KeyError(
                "Unknown mode %s, use only 0 or 1 or 2." % mode
            )

        ret = seg(item, figures, symbol)

        if tokenization is True:
            ret = tokenize(ret, **(tokenization_params if tokenization_params is not None else {}))

        return ret
    except Exception as e:  # pragma: no cover
        # Format the traceback while the exception is still being handled.
        msg = traceback.format_exc()
        if errors == "warn":
            warnings.warn(msg)
        elif errors == "raise":
            raise e
        # Any other `errors` value: swallow the exception and implicitly return None.