# SPDX-License-Identifier: Apache-2.0
import warnings
from uuid import uuid4
from contextlib import contextmanager
from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
import numpy as np
import sklearn.base
from .proto import get_latest_tested_opset_version
from .common._topology import convert_topology
from .common.utils_sklearn import _process_options
from ._parse import parse_sklearn_model
# Invoke the registration of all our converters and shape calculators.
from . import shape_calculators # noqa: F401
from . import operator_converters # noqa: F401
def convert_sklearn(
model,
name=None,
initial_types=None,
doc_string="",
target_opset=None,
custom_conversion_functions=None,
custom_shape_calculators=None,
custom_parsers=None,
options=None,
intermediate=False,
white_op=None,
black_op=None,
final_types=None,
dtype=None,
naming=None,
model_optim=True,
verbose=0,
):
"""
    This function produces an equivalent
    *ONNX* model of the given *scikit-learn* model.
    The list of supported converters is returned by function
    :func:`supported_converters <skl2onnx.supported_converters>`.
    For pipeline conversion, the user needs to make sure each component
    is one of the supported items.
    Note that initial types are required for every conversion.
    The *ONNX* model name can also be specified.
:param model: A scikit-learn model
:param initial_types: a python list.
Each element is a tuple of a variable name
and a type defined in `data_types.py`
:param name: The name of the graph (type: GraphProto)
in the produced ONNX model (type: ModelProto)
    :param doc_string: A string attached to the produced ONNX model
:param target_opset: number, for example, 7 for
ONNX 1.2, and 8 for ONNX 1.3,
if value is not specified, the function will
choose the latest tested opset
(see :py:func:`skl2onnx.get_latest_tested_opset_version`)
    :param custom_conversion_functions: a dictionary for
        specifying the user customized conversion functions,
        they take precedence over registered converters
    :param custom_shape_calculators: a dictionary for
        specifying the user customized shape calculators,
        they take precedence over registered shape calculators
    :param custom_parsers: parsers determine which outputs are
        expected for a particular task,
        default parsers are defined for classifiers,
        regressors and pipelines but they can be rewritten,
        *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs, custom_parsers=None) }``
:param options: specific options given to converters
(see :ref:`l-conv-options`)
    :param intermediate: if True, the function returns both the
        converted model and the instance of :class:`Topology` used,
        otherwise it returns only the converted model
:param white_op: white list of ONNX nodes allowed
while converting a pipeline,
if empty, all are allowed
    :param black_op: black list of ONNX nodes
        disallowed while converting a pipeline,
        if empty, none are blacklisted
    :param final_types: a python list. Works the same way as *initial_types*
        but is not mandatory, it is used to overwrite the type
        (if type is not None) and the name of every output
    :param dtype: removed in version 1.7.5, dtype is
        now inferred from input types,
        converters may add Cast operators to switch
        to double when necessary
    :param naming: the user may want to change the way intermediate
        variables are named, this parameter can be a string (a prefix) or a
        function whose signature is
        `get_name(name, existing_names)`, the library then
        checks that the name is unique and modifies it if not
    :param model_optim: enable or disable model optimisation
        after the model is converted to ONNX, the optimisation reduces
        the number of identity nodes
:param verbose: display progress while converting a model
:return: An ONNX model (type: ModelProto) which is
equivalent to the input scikit-learn model
Example of *initial_types*:
Assume that the specified *scikit-learn* model takes
a heterogeneous list as its input.
    If the first 5 elements are floats and the last 10 elements are integers,
    we need to specify the initial types as shown below. ``None`` in
    ``[None, 5]`` indicates that the batch size is unknown.
::
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
initial_type = [('float_input', FloatTensorType([None, 5])),
('int64_input', Int64TensorType([None, 10]))]
.. note::
If a pipeline includes an instance of
`ColumnTransformer <https://scikit-learn.org/stable/modules/
generated/sklearn.compose.ColumnTransformer.html>`_,
        *scikit-learn* allows the user to specify columns by name.
        This option is not supported
        by *sklearn-onnx* as feature names could differ between
        the input data and the ONNX graph
        (defined by parameter *initial_types*); only integer indices are supported.
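    A minimal end-to-end conversion may look like the following sketch,
    where the fitted ``LogisticRegression`` model and the random data
    are only placeholders:
    ::

        import numpy as np
        from sklearn.linear_model import LogisticRegression
        from skl2onnx import convert_sklearn
        from skl2onnx.common.data_types import FloatTensorType

        X = np.random.rand(20, 5).astype(np.float32)
        y = (X.sum(axis=1) > 2.5).astype(np.int64)
        model = LogisticRegression().fit(X, y)
        onx = convert_sklearn(
            model, "logreg",
            initial_types=[("input", FloatTensorType([None, 5]))])
        with open("logreg.onnx", "wb") as f:
            f.write(onx.SerializeToString())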
Converters options
++++++++++++++++++
    Some ONNX operators expose parameters *sklearn-onnx* cannot
    guess from the raw model. Default values are usually suggested
    but the user may have to overwrite them manually. This is
    not straightforward when the model is included in a pipeline.
That's why these options can be given to function *convert_sklearn*
as a dictionary ``{model_type: parameters in a dictionary}`` or
``{model_id: parameters in a dictionary}``.
    Option *separators* is used to specify the delimiters between two words
    when the ONNX graph needs to tokenize a string.
    The default list is short and may not include all
    the necessary separators. It can be overwritten as:
::
extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?',
',', ';', ':', '\\\\!', '\\\\(', '\\\\)']}}
model_onnx = convert_sklearn(
model, "tfidf",
initial_types=[("input", StringTensorType([None, 1]))],
options=extra)
    But if a pipeline contains two models of the same class,
    it is possible to distinguish between them with the function *id*:
::
extra = {id(model): {"separators": [' ', '.', '\\\\?', ',', ';',
':', '\\\\!', '\\\\(', '\\\\)']}}
model_onnx = convert_sklearn(
pipeline, "pipeline-with-2-tfidf",
initial_types=[("input", StringTensorType([None, 1]))],
options=extra)
It is used in example :ref:`l-example-tfidfvectorizer`.
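    A *naming* callable could look like the following sketch, the prefix
    ``my_`` is arbitrary and the library still enforces uniqueness
    afterwards:
    ::

        def get_name(name, existing_names):
            # prefix every intermediate name, skl2onnx renames the
            # result again if it is not unique
            return "my_" + name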
.. versionchanged:: 1.10.0
Parameter *naming* was added.
"""
if initial_types is None:
if hasattr(model, "infer_initial_types"):
initial_types = model.infer_initial_types()
else:
raise ValueError(
"Initial types are required. See usage of "
"convert(...) in skl2onnx.convert for details"
)
if name is None:
        name = uuid4().hex
if dtype is not None:
warnings.warn(
"Parameter dtype is no longer supported. It will be removed in 1.9.0.",
DeprecationWarning,
stacklevel=0,
)
target_opset = target_opset if target_opset else get_latest_tested_opset_version()
# Parse scikit-learn model as our internal data structure
# (i.e., Topology)
if verbose >= 1:
print("[convert_sklearn] parse_sklearn_model")
topology = parse_sklearn_model(
model,
initial_types,
target_opset,
custom_conversion_functions,
custom_shape_calculators,
custom_parsers,
options=options,
white_op=white_op,
black_op=black_op,
final_types=final_types,
naming=naming,
)
# Convert our Topology object into ONNX. The outcome is an ONNX model.
options = _process_options(model, options)
if verbose >= 1:
print("[convert_sklearn] convert_topology")
onnx_model = convert_topology(
topology,
name,
doc_string,
target_opset,
options=options,
remove_identity=model_optim and not intermediate,
verbose=verbose,
)
if verbose >= 1:
print("[convert_sklearn] end")
if verbose >= 2:
scope = topology.scopes[0]
print("---INPUTS---")
for inp in scope.input_variables:
print(" %r" % inp)
print("---OUTPUTS---")
        for out in scope.output_variables:
            print(" %r" % out)
print("---VARIABLES---")
for k, v in sorted(scope.variables.items()):
print(" %r: is.fed=%r is_leaf=%r - %r" % (k, v.is_fed, v.is_leaf, v))
print("---OPERATORS---")
for k, v in sorted(scope.operators.items()):
print(" %r: is.evaluated=%r - %r" % (k, v.is_evaluated, v))
return (onnx_model, topology) if intermediate else onnx_model
def to_onnx(
model: sklearn.base.BaseEstimator,
    X: Optional[np.ndarray] = None,
name: Optional[str] = None,
initial_types: Optional[
List[Tuple[str, Sequence[Optional[Union[int, str]]]]]
] = None,
target_opset: Optional[Union[Dict[str, int], int]] = None,
options: Optional[Dict] = None,
white_op: Optional[Set[str]] = None,
black_op: Optional[Set[str]] = None,
final_types: Optional[List[Tuple[str, Sequence[Optional[Union[int, str]]]]]] = None,
dtype: Optional[np.dtype] = None,
naming: Optional[Callable] = None,
model_optim: bool = True,
verbose: int = 0,
):
"""
Calls :func:`convert_sklearn` with simplified parameters.
:param model: model to convert
    :param X: training set, can be None, it is used to infer the
        input types (*initial_types*)
:param initial_types: if X is None, then *initial_types* must be
defined
:param target_opset: conversion with a specific target opset
:param options: specific options given to converters
(see :ref:`l-conv-options`)
:param name: name of the model
:param white_op: white list of ONNX nodes allowed
while converting a pipeline, if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline, if empty, none are blacklisted
    :param final_types: a python list. Works the same way as *initial_types*
        but is not mandatory, it is used to overwrite the type
        (if type is not None) and the name of every output
    :param dtype: removed in version 1.7.5, dtype is now inferred from
        input types, converters may add Cast operators to switch to
        double when necessary
    :param naming: the user may want to change the way intermediate
        variables are named, this parameter can be a string (a prefix) or a
        function whose signature is
        `get_name(name, existing_names)`, the library then
        checks that the name is unique and modifies it if not
    :param model_optim: enable or disable model optimisation
        after the model is converted to ONNX, the optimisation reduces
        the number of identity nodes
:param verbose: display progress while converting a model
:return: converted model
    If the model inherits from class
    :class:`OnnxOperatorMixin`, this function calls its method
    *to_onnx*, otherwise it calls :func:`convert_sklearn`.
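    A typical call looks like the following sketch, the ``KMeans``
    model and the random data are only placeholders:
    ::

        import numpy as np
        from sklearn.cluster import KMeans
        from skl2onnx import to_onnx

        X = np.random.rand(30, 4).astype(np.float32)
        model = KMeans(n_clusters=3).fit(X)
        # one row is enough to infer the input types
        onx = to_onnx(model, X[:1])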
.. versionchanged:: 1.18.0
        The main opset is now exactly *target_opset* and no longer a value
        less than or equal to it.
"""
from .algebra.onnx_operator_mixin import OnnxOperatorMixin
from .algebra.type_helper import guess_initial_types
if isinstance(model, OnnxOperatorMixin):
if options is not None:
raise NotImplementedError(
"options not yet implemented for OnnxOperatorMixin."
)
return model.to_onnx(X=X, name=name, target_opset=target_opset)
if name is None:
name = "ONNX(%s)" % model.__class__.__name__
initial_types = guess_initial_types(X, initial_types)
if verbose >= 1:
print("[to_onnx] initial_types=%r" % initial_types)
model = convert_sklearn(
model,
initial_types=initial_types,
target_opset=target_opset,
name=name,
options=options,
white_op=white_op,
black_op=black_op,
final_types=final_types,
dtype=dtype,
verbose=verbose,
naming=naming,
model_optim=model_optim,
)
new_target_model = None
for op in model.opset_import:
if op.domain == "":
new_target_model = op.version
break
expected_target = (
target_opset
if not isinstance(target_opset, dict)
else target_opset.get("", None)
)
if expected_target is not None and new_target_model != expected_target:
for op in model.opset_import:
if op.domain == "":
op.version = expected_target
break
return model
def wrap_as_onnx_mixin(model, target_opset=None):
"""
    Combines a *scikit-learn* class with :class:`OnnxOperatorMixin`
    to produce a new object exposing both the *scikit-learn* API
    and the *OnnxOperatorMixin* API.
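    Usage sketch, assuming the function is importable from the package
    top level and with ``LogisticRegression`` and random data as
    placeholders:
    ::

        import numpy as np
        from sklearn.linear_model import LogisticRegression
        from skl2onnx import wrap_as_onnx_mixin

        X = np.random.rand(20, 3).astype(np.float32)
        y = (X.sum(axis=1) > 1.5).astype(np.int64)
        wrapped = wrap_as_onnx_mixin(LogisticRegression(), target_opset=15)
        wrapped.fit(X, y)
        onx = wrapped.to_onnx(X=X[:1])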
"""
from .algebra.sklearn_ops import find_class
cl = find_class(model.__class__)
if "automation" in str(cl):
raise RuntimeError("Wrong class name '{}'.".format(cl))
state = model.__getstate__()
obj = object.__new__(cl)
obj.__setstate__(state)
obj.op_version = target_opset
return obj
@contextmanager
def may_switch_bases_classes_order(cls):
"""
    XGBClassifier and XGBRegressor do not inherit from
    sklearn.base.ClassifierMixin and sklearn.base.RegressorMixin in the
    expected order. We swap the base classes when the model is registered
    to avoid ``AttributeError: 'super' object has no attribute
    '__sklearn_tags__'`` being raised.
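    Usage sketch, assuming *xgboost* is installed, the body stands for
    any code needing the corrected method resolution order, such as
    converter registration:
    ::

        from xgboost import XGBClassifier

        with may_switch_bases_classes_order(XGBClassifier):
            # inside the block the mixin order is fixed, so calls
            # relying on super().__sklearn_tags__() resolve correctly
            XGBClassifier().__sklearn_tags__()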
"""
bases = cls.__bases__
if len(bases) == 2 and bases[1] in (
sklearn.base.ClassifierMixin,
sklearn.base.RegressorMixin,
):
cls.__bases__ = bases[1], bases[0]
revert = True
else:
revert = False
try:
yield
finally:
if revert:
cls.__bases__ = bases