Source code for skl2onnx.operator_converters.text_vectoriser

# SPDX-License-Identifier: Apache-2.0


import warnings
from collections import OrderedDict, Counter
import numpy as np
from ..common._apply_operation import apply_cast, apply_reshape, apply_identity
from ..common._registration import register_converter
from ..common._topology import Scope, Operator
from ..common._container import ModelComponentContainer
from ..common.data_types import guess_proto_type, StringTensorType
from ..proto import onnx_proto
from ..algebra.onnx_ops import OnnxStringNormalizer


def _intelligent_split(text, op, tokenizer, existing):
    """
    Splits text into tokens. *scikit-learn*
    merges tokens with ``' '.join(tokens)``
    to name n-grams, so ``'a  b'`` could come from
    ``('a ', 'b')`` or ``('a', ' b')``.
    See `ngram sequence
    <https://github.com/scikit-learn/scikit-learn/blob/main/
    sklearn/feature_extraction/text.py#L169>`_.
    """
    if op.analyzer == "word":
        if op.ngram_range[0] == op.ngram_range[1] == 1:
            spl = [text]
        elif op.ngram_range[0] == 1 and len(text) >= 2:
            # Every element is in the vocabulary.
            # Naive method
            p1 = len(text) - len(text.lstrip())
            p2_ = len(text) - len(text.rstrip())
            if p2_ == 0:
                p2 = len(text)
            else:
                p2 = -p2_
            spl = text[p1:p2].split()
            if len(spl) <= 1:
                spl = [text]
            else:
                spl[0] = " " * p1 + spl[0]
                spl[-1] = spl[-1] + " " * p2_
            exc = None
            if len(spl) == 1:
                pass
            elif len(spl) == 2:
                if spl[0] not in op.vocabulary_ or spl[1] not in op.vocabulary_:
                    # This is necessarily a single token.
                    spl = [text]
                elif spl[0] in op.vocabulary_ and spl[1] in op.vocabulary_:
                    # Ambiguity: w1 and w2 can be either a 2-gram
                    # or two separate tokens.
                    # Usually, ' ' is not part of any token.
                    pass
            elif len(spl) == 3:
                stok = (all([s in op.vocabulary_ for s in spl]), spl)
                spl12 = (
                    spl[2] in op.vocabulary_
                    and (spl[0] + " " + spl[1]) in op.vocabulary_,
                    [spl[0] + " " + spl[1], spl[2]],
                )
                spl23 = (
                    spl[0] in op.vocabulary_
                    and (spl[1] + " " + spl[2]) in op.vocabulary_,
                    [spl[0], spl[1] + " " + spl[2]],
                )
                c = Counter(map(lambda t: t[0], [stok, spl12, spl23]))
                if c.get(True, -1) == 0:
                    spl = [text]
                found = [el[1] for el in [stok, spl12, spl23] if el[0]]
                if len(found) == 1:
                    spl = found[0]
                elif len(found) == 0:
                    spl = [text]
                elif stok[0]:
                    # By default, we assume the n-gram is just the
                    # concatenation of single words.
                    pass
                else:
                    exc = (
                        "More than one decomposition in tokens: ["
                        + ", ".join(map(lambda t: "-".join(t), found))
                        + "]."
                    )
            elif any(map(lambda g: g in op.vocabulary_, spl)):
                # TODO: handle this case with an algorithm
                # which is able to break a string into
                # known substrings.
                exc = "Unable to identify tokens in n-grams."
            if exc:
                raise RuntimeError(
                    "Unable to split n-grams '{}' into tokens. "
                    "{} This happens when a token contain "
                    "spaces. Token '{}' may be a token or a n-gram '{}'."
                    "".format(text, exc, text, spl)
                )
        else:
            # We reuse the tokenizer, hoping that it will clear
            # ambiguities, but this might be slow.
            spl = tokenizer(text)
    else:
        spl = list(text)

    spl = tuple(spl)
    if spl in existing:
        raise RuntimeError(
            f"The converter cannot guess how to split expression "
            f"{text!r} into tokens. This case happens when tokens have "
            f"spaces."
        )
    if op.ngram_range[0] == 1 and (len(op.ngram_range) == 1 or op.ngram_range[1] > 1):
        # All grams must exist in the vocabulary.
        for g in spl:
            if g not in op.vocabulary_:
                raise RuntimeError(
                    "Unable to split n-grams '{}' into tokens {} "
                    "existing in the vocabulary. Token '{}' does not "
                    "exist in the vocabulary."
                    ".".format(text, spl, g)
                )
    existing.add(spl)
    return spl
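

# Illustrative sketch of how ``_intelligent_split`` is called by the converter
# below: a fitted vectorizer, the tokenizer returned by ``build_tokenizer()``
# and a set of already seen splits. The fitted ``vec`` and its corpus are
# hypothetical, only showing the expected calling convention.
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   vec = CountVectorizer(ngram_range=(1, 2)).fit(
#       ["machine learning", "deep learning"])
#   _intelligent_split("machine learning", vec, vec.build_tokenizer(), set())
#   # -> ('machine', 'learning'), both words exist in vec.vocabulary_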


def convert_sklearn_text_vectorizer(
    scope: Scope, operator: Operator, container: ModelComponentContainer
):
    """
    Converter for class
    `TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/
    sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
    The current implementation is a work in progress and the ONNX version
    does not produce the exact same results. The converter lets the user
    change some of its parameters.

    Additional options
    ------------------

    tokenexp: string
        The default will change to true in version 1.6.0.
        The tokenizer splits into words using this regular
        expression or the regular expression specified by
        *scikit-learn* if the value is an empty string.
        See also the note below.
        Default value: None
    separators: list of separators
        These separators are used to split a string into words.
        Option *separators* is ignored if option *tokenexp* is not None.
        Default value: ``[' ', '[.]', '\\\\?', ',', ';', ':', '\\\\!']``.
    locale:
        The locale is not exposed by the scikit-learn object. This option
        can be used to change the value of parameter `locale`
        of ONNX operator `StringNormalizer`.

    Example (from :ref:`l-example-tfidfvectorizer`):

    ::

        seps = {TfidfVectorizer: {"separators": [
            ' ', '[.]', '\\\\?', ',', ';', ':', '!', '\\\\(', '\\\\)',
            '\\n', '\\\\"', "'", "-", "\\\\[", "\\\\]", "@"]}}
        model_onnx = convert_sklearn(
            pipeline, "tfidf",
            initial_types=[("input", StringTensorType([None, 2]))],
            options=seps)

    The default regular expression of the tokenizer is
    ``(?u)\\\\b\\\\w\\\\w+\\\\b``
    (see `re <https://docs.python.org/3/library/re.html>`_).
    This expression may not be supported by the library handling the
    backend. `onnxruntime <https://github.com/Microsoft/onnxruntime>`_
    uses `re2 <https://github.com/google/re2>`_. You may need to switch
    to a custom tokenizer based on the
    `python wrapper for re2 <https://pypi.org/project/re2/>`_
    or its sources `pyre2 <https://github.com/facebook/pyre2>`_
    (`syntax <https://github.com/google/re2/blob/main/doc/syntax.txt>`_).
    If the regular expression is not specified and if
    the instance of TfidfVectorizer is using the default
    pattern ``(?u)\\\\b\\\\w\\\\w+\\\\b``, it is replaced by
    ``[a-zA-Z0-9_]+``. Any other case has to be
    manually handled.

    Regular expression ``[^\\\\\\\\n]`` is used to split
    a sentence into characters (and not words) if ``analyzer=='char'``.
    The mode ``analyzer=='char_wb'`` is not implemented.
    """  # noqa
    op = operator.raw_operator

    if container.target_opset is not None and container.target_opset < 9:
        raise RuntimeError(
            "Converter for '{}' only works for opset >= 9."
            "".format(op.__class__.__name__)
        )

    if op.analyzer == "char_wb":
        raise NotImplementedError(
            "CountVectorizer cannot be converted, "
            "only tokenizer='word' is fully supported. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues."
        )
    if op.analyzer == "char":
        warnings.warn(
            "The conversion of CountVectorizer may not work. "
            "Only tokenizer='word' is fully supported. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues.",
            UserWarning,
        )
    if op.strip_accents is not None:
        raise NotImplementedError(
            "CountVectorizer cannot be converted, "
            "only strip_accents=None is supported. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues."
        )

    options = container.get_options(
        op,
        dict(
            separators="DEFAULT",
            tokenexp=None,
            nan=False,
            keep_empty_string=False,
            locale=None,
        ),
    )
    if set(options) != {"separators", "tokenexp", "nan", "keep_empty_string", "locale"}:
        raise RuntimeError(
            "Unknown option {} for {}".format(set(options) - {"separators"}, type(op))
        )

    if op.analyzer == "word":
        default_pattern = "(?u)\\b\\w\\w+\\b"
        if options["separators"] == "DEFAULT" and options["tokenexp"] is None:
            regex = op.token_pattern
            if regex == default_pattern:
                regex = "[a-zA-Z0-9_]+"
            default_separators = None
        elif options["tokenexp"] is not None:
            if options["tokenexp"]:
                regex = options["tokenexp"]
            else:
                regex = op.token_pattern
                if regex == default_pattern:
                    regex = "[a-zA-Z0-9_]+"
            default_separators = None
        else:
            regex = None
            default_separators = options["separators"]
    else:
        if options["separators"] != "DEFAULT":
            raise RuntimeError(
                "Option separators has no effect if analyzer != 'word'."
            )
        regex = options["tokenexp"] if options["tokenexp"] else "."
        default_separators = None

    if op.preprocessor is not None:
        raise NotImplementedError(
            "Custom preprocessor cannot be converted into ONNX. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues."
        )
    if op.tokenizer is not None:
        raise NotImplementedError(
            "Custom tokenizer cannot be converted into ONNX. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues."
        )
    if op.strip_accents is not None:
        raise NotImplementedError(
            "Operator StringNormalizer cannot remove accents. "
            "You may raise an issue at "
            "https://github.com/onnx/sklearn-onnx/issues."
        )

    if hasattr(op, "stop_words_"):
        stop_words = op.stop_words_ | (set(op.stop_words) if op.stop_words else set())
    else:
        stop_words = set()
    for w in stop_words:
        if not isinstance(w, str):
            raise TypeError(
                f"One stop word is not a string {w!r} in stop_words={stop_words}."
            )

    if op.lowercase or stop_words:
        if len(operator.input_full_names) != 1:
            raise RuntimeError(
                "Only one input is allowed, found {}.".format(
                    operator.input_full_names
                )
            )

        # StringNormalizer
        op_type = "StringNormalizer"
        attrs = {"name": scope.get_unique_operator_name(op_type)}
        normalized = scope.get_unique_variable_name("normalized")
        if container.target_opset >= 10:
            attrs.update(
                {
                    "case_change_action": "LOWER",
                    "is_case_sensitive": not op.lowercase,
                }
            )
            op_version = 10
            domain = ""
        else:
            attrs.update(
                {
                    "casechangeaction": "LOWER",
                    "is_case_sensitive": not op.lowercase,
                }
            )
            op_version = 9
            domain = "com.microsoft"
        if options["locale"] is not None:
            attrs["locale"] = options["locale"]
        opvs = 1 if domain == "com.microsoft" else op_version
        if stop_words:
            attrs["stopwords"] = list(sorted(stop_words))

        if options["keep_empty_string"]:
            del attrs["name"]
            op_norm = OnnxStringNormalizer(
                "text_in",
                op_version=container.target_opset,
                output_names=["text_out"],
                **attrs,
            )
            scan_body = op_norm.to_onnx(
                OrderedDict([("text_in", StringTensorType())]),
                outputs=[("text_out", StringTensorType())],
                target_opset=op_version,
            )
            vector = scope.get_unique_variable_name("vector")
            apply_reshape(
                scope,
                operator.input_full_names[0],
                vector,
                container,
                desired_shape=(-1, 1),
            )
            container.add_node(
                "Scan", vector, normalized, body=scan_body.graph, num_scan_inputs=1
            )
        else:
            flatten = scope.get_unique_variable_name("flattened")
            apply_reshape(
                scope,
                operator.input_full_names[0],
                flatten,
                container,
                desired_shape=(-1,),
            )
            container.add_node(
                op_type, flatten, normalized, op_version=opvs, op_domain=domain, **attrs
            )
    else:
        normalized = operator.input_full_names

    # Tokenizer
    padvalue = "#"
    while padvalue in op.vocabulary_:
        padvalue += "#"

    op_type = "Tokenizer"
    attrs = {"name": scope.get_unique_operator_name(op_type)}
    attrs.update(
        {
            "pad_value": padvalue,
            "mark": False,
            "mincharnum": 1,
        }
    )
    if regex is None:
        attrs["separators"] = default_separators
    else:
        attrs["tokenexp"] = regex

    tokenized = scope.get_unique_variable_name("tokenized")
    container.add_node(
        op_type, normalized, tokenized, op_domain="com.microsoft", **attrs
    )

    # Flatten
    # Tokenizer outputs shape {1, C} or {1, 1, C}.
    # The second shape is not allowed by TfIdfVectorizer.
    # We use Flatten which produces {1, C} in both cases.
    flatt_tokenized = scope.get_unique_variable_name("flattened")
    container.add_node(
        "Flatten",
        tokenized,
        flatt_tokenized,
        name=scope.get_unique_operator_name("Flatten"),
    )
    tokenized = flatt_tokenized

    # Ngram - TfIdfVectorizer
    C = max(op.vocabulary_.values()) + 1
    words = [None for i in range(C)]
    weights = [0 for i in range(C)]
    for k, v in op.vocabulary_.items():
        words[v] = k
        weights[v] = 1.0
    mode = "TF"

    # scikit-learn sorts n-grams in alphabetical order,
    # ONNX assumes they are sorted by n.
    tokenizer = op.build_tokenizer()
    split_words = []
    existing = set()
    errors = []
    for w in words:
        if isinstance(w, tuple):
            # TraceableCountVectorizer, TraceableTfIdfVectorizer
            spl = list(w)
            w = " ".join(w)
        else:
            # CountVectorizer, TfIdfVectorizer
            try:
                spl = _intelligent_split(w, op, tokenizer, existing)
            except RuntimeError as e:
                errors.append(e)
                continue
        split_words.append((spl, w))

    if len(errors) > 0:
        err = "\n".join(map(str, errors))
        raise RuntimeError(
            f"There were ambiguities between n-grams and tokens. "
            f"{len(errors)} errors occurred. You can fix it by using "
            f"class Traceable{op.__class__.__name__}.\n"
            f"You can learn more at https://github.com/scikit-learn/"
            f"scikit-learn/issues/13733.\n{err}"
        )

    ng_split_words = sorted([(len(a[0]), a[0], i) for i, a in enumerate(split_words)])
    key_indices = [a[2] for a in ng_split_words]
    ngcounts = [0 for i in range(op.ngram_range[0])]

    words = list(ng_split_words[0][1])
    for i in range(1, len(ng_split_words)):
        if ng_split_words[i - 1][0] != ng_split_words[i][0]:
            ngcounts.append(len(words))
        words.extend(ng_split_words[i][1])

    weights_ = [weights[a[2]] for a in ng_split_words]
    weights = list(weights_)
    for i, ind in enumerate(key_indices):
        weights[ind] = weights_[i]

    # Create the node.
    attrs = {"name": scope.get_unique_operator_name("TfIdfVectorizer")}
    attrs.update(
        {
            "min_gram_length": op.ngram_range[0],
            "max_gram_length": op.ngram_range[1],
            "mode": mode,
            "max_skip_count": 0,
            "pool_strings": words,
            "ngram_indexes": key_indices,
            "ngram_counts": ngcounts,
            "weights": list(map(np.float32, weights)),
        }
    )
    output = scope.get_unique_variable_name("output")

    proto_dtype = guess_proto_type(operator.inputs[0].type)
    if proto_dtype != onnx_proto.TensorProto.DOUBLE:
        proto_dtype = onnx_proto.TensorProto.FLOAT
    if proto_dtype == onnx_proto.TensorProto.DOUBLE:
        output_tf = scope.get_unique_variable_name("cast_result")
    else:
        output_tf = output

    if container.target_opset < 9:
        op_type = "Ngram"
        container.add_node(
            op_type, tokenized, output_tf, op_domain="com.microsoft", **attrs
        )
    else:
        op_type = "TfIdfVectorizer"
        container.add_node(
            op_type, tokenized, output_tf, op_domain="", op_version=9, **attrs
        )

    if proto_dtype == onnx_proto.TensorProto.DOUBLE:
        apply_cast(scope, output_tf, output, container, to=proto_dtype)

    if op.binary:
        cast_result_name = scope.get_unique_variable_name("cast_result")
        output_name = scope.get_unique_variable_name("output_name")
        apply_cast(
            scope, output, cast_result_name, container, to=onnx_proto.TensorProto.BOOL
        )
        apply_cast(
            scope,
            cast_result_name,
            output_name,
            container,
            to=onnx_proto.TensorProto.FLOAT,
        )
        output = output_name

    options = container.get_options(op, dict(nan=False))
    replace_by_nan = options.get("nan", False)
    if replace_by_nan:
        # This part replaces all null values by nan.
        cst_nan_name = scope.get_unique_variable_name("nan_name")
        container.add_initializer(cst_nan_name, proto_dtype, [1], [np.nan])
        cst_zero_name = scope.get_unique_variable_name("zero_name")
        container.add_initializer(cst_zero_name, proto_dtype, [1], [0])

        mask_name = scope.get_unique_variable_name("mask_name")
        container.add_node(
            "Equal",
            [output, cst_zero_name],
            mask_name,
            name=scope.get_unique_operator_name("Equal"),
        )

        where_name = scope.get_unique_variable_name("where_name")
        container.add_node(
            "Where",
            [mask_name, cst_nan_name, output],
            where_name,
            name=scope.get_unique_operator_name("Where"),
        )
        output = where_name

    apply_identity(scope, output, operator.output_full_names, container)


register_converter(
    "SklearnCountVectorizer",
    convert_sklearn_text_vectorizer,
    options={
        "tokenexp": None,
        "separators": None,
        "nan": [True, False],
        "keep_empty_string": [True, False],
        "locale": None,
    },
)
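

# Minimal usage sketch of the converter registered above. It runs only when
# the module is executed as a script. The corpus, the [None, 1] input shape
# and the "tokenexp" value are illustrative assumptions, not requirements of
# the converter.
if __name__ == "__main__":
    from sklearn.feature_extraction.text import CountVectorizer
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import StringTensorType

    corpus = ["machine learning with onnx", "sparse models", "onnx runtime"]
    vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(corpus)

    # Options are keyed by the scikit-learn class, as in the docstring example.
    onnx_model = convert_sklearn(
        vectorizer,
        "count_vectorizer",
        initial_types=[("input", StringTensorType([None, 1]))],
        options={CountVectorizer: {"tokenexp": "[a-zA-Z0-9_]+"}},
    )
    print(onnx_model.graph.name)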