Dealing with discrepancies (tf-idf)

TfidfVectorizer is one transform for which the converted ONNX model may produce results different from scikit-learn's. The larger the vocabulary, the higher the probability of getting different results. This example proposes an equivalent model with no discrepancies.

Imports, setups

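All imports, followed by two helpers: `print_sparse_matrix` renders a matrix as text and `diff` measures the discrepancy between two matrices. Note that this example does not register any ONNX converter for xgboost or lightgbm; only scikit-learn, skl2onnx and onnxruntime are used.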

import pprint
import numpy
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from onnxruntime import InferenceSession
from skl2onnx import to_onnx


def print_sparse_matrix(m):
    """Render a 2-D matrix as text, one character per coefficient.

    Coefficients which are not strictly positive (NaN included) are drawn
    as ``"."``. Strictly positive coefficients are mapped linearly from the
    ``[min, max]`` range of the matrix onto the letters ``A`` to ``Z``.

    :param m: two-dimensional dense array, or any object exposing
        ``todense()`` (such as a scipy sparse matrix)
    :return: a string with one line per row of *m* and one character
        per column
    """
    dense = m.todense() if hasattr(m, "todense") else m
    # NaN and infinities are replaced on the dense values *before* the
    # bounds are computed and the same cleaned values are used for the
    # scaling. Scaling the raw values instead would try to convert an
    # infinite ratio into an integer and fail.
    values = numpy.nan_to_num(numpy.asarray(dense))
    lo, hi = values.min(), values.max()
    if lo == hi:
        # Constant matrix: widen the range to avoid a division by zero.
        hi += 1

    letters = numpy.array([chr(ord("A") + k) for k in range(26)])
    rendered = numpy.full(values.shape, ".", dtype="<U1")
    positive = values > 0
    # Truncation towards zero gives a level in [0, 25], i.e. 'A'..'Z'.
    levels = ((values[positive] - lo) / (hi - lo) * 25).astype(int)
    rendered[positive] = letters[levels]
    return "\n".join("".join(row) for row in rendered)


def diff(a, b):
    """Return the mean absolute difference between two matrices.

    Both operands are converted to dense arrays first. This matters for
    scipy sparse matrices whose ``size`` attribute counts the stored
    values only: dividing by it would normalize the result by the number
    of non-null coefficients instead of the number of coefficients.

    :param a: dense array or object exposing ``todense()``
    :param b: dense array or object exposing ``todense()``,
        same shape as *a*
    :return: ``sum(|a - b|)`` divided by the total number of coefficients
    :raises ValueError: if *a* and *b* do not have the same shape
    """
    if a.shape != b.shape:
        raise ValueError(
            f"Cannot compare matrices with different shapes {a.shape} != {b.shape}."
        )
    dense_a, dense_b = (
        numpy.asarray(x.todense() if hasattr(x, "todense") else x) for x in (a, b)
    )
    return numpy.abs(dense_a - dense_b).sum() / dense_a.size

Artificial datasets

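A single text column: a handful of strings (plain sentences, code, URLs, a timestamp) chosen because they are difficult to tokenize.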

# Seven text samples stored as a single column: the trailing index adds
# the second axis, which gives an array of shape (7, 1).
strings = numpy.array(
    [
        "This a sentence.",
        "This a sentence with more characters $^*&'(-...",
        """var = ClassName(var2, user=mail@anywhere.com, pwd"""
        """=")_~-('&]@^\\`|[{#")""",
        "c79857654",
        "https://complex-url.com/;76543u3456?g=hhh&h=23",
        "01-03-05T11:12:13",
        "https://complex-url.com/;dd76543u3456?g=ddhhh&h=23",
    ]
)[:, numpy.newaxis]

pprint.pprint(strings)
array([['This a sentence.'],
       ["This a sentence with more characters $^*&'(-..."],
       ['var = ClassName(var2, user=mail@anywhere.com, pwd=")_~-(\'&]@^\\`|[{#")'],
       ['c79857654'],
       ['https://complex-url.com/;76543u3456?g=hhh&h=23'],
       ['01-03-05T11:12:13'],
       ['https://complex-url.com/;dd76543u3456?g=ddhhh&h=23']],
      dtype='<U69')

Fit a TfidfVectorizer

# One-step pipeline: a ColumnTransformer named "pre" which applies a
# TfidfVectorizer (named "tfidf") to column 0 of the input.
tfidf = Pipeline(
    steps=[
        (
            "pre",
            ColumnTransformer(transformers=[("tfidf", TfidfVectorizer(), 0)]),
        )
    ]
)

We leave a couple of strings out of the training set.

# Train on every row but the last two, then transform the whole dataset,
# the two held-out rows included.
tr = tfidf.fit(strings[:-2]).transform(strings)
# Fitted vectorizer, retrieved by step name and transformer name.
tfidf_step = tfidf.named_steps["pre"].named_transformers_["tfidf"]
# Uncomment to display the vocabulary behind every output column.
# print(f"output columns: {tfidf_step.get_feature_names_out()}")
print("rendered outputs", print_sparse_matrix(tr), sep="\n")
rendered outputs
..............RR.....
.....M......M.JJ....M
...J..JH...J.J...JJJ.
....Z................
JJJ....HJJJ.....J....
.....................
K.K....IK.K.....K....

Conversion to ONNX

# Convert the fitted pipeline; the dataset is handed over as sample data
# (presumably used by the converter to infer the input type -- see the
# skl2onnx documentation).
onx = to_onnx(tfidf, X=strings)

Execution with ONNX

# Run the serialized model on CPU with onnxruntime.
sess = InferenceSession(
    onx.SerializeToString(),
    providers=["CPUExecutionProvider"],
)
# All outputs are requested (None); only the first one is kept.
got = sess.run(None, dict(X=strings))[0]
# Compare with the scikit-learn output, then render the ONNX output.
print("differences={:g}".format(diff(tr, got)))
print(print_sparse_matrix(got))
differences=3.25823e-08
..............RR.....
.....M......M.JJ....M
...J..JH...J.J...JJJ.
....Z................
JJJ....HJJJ.....J....
.....................
K.K....IK.K.....K....

Total running time of the script: (0 minutes 0.039 seconds)

Gallery generated by Sphinx-Gallery