# SPDX-License-Identifier: Apache-2.0
import warnings
from uuid import uuid4
from typing import Callable, Dict, List, Optional, Sequence, Set, Tuple, Union
import numpy as np
import sklearn.base
from .proto import get_latest_tested_opset_version
from .common._topology import convert_topology
from .common.utils_sklearn import _process_options
from ._parse import parse_sklearn_model
# Invoke the registration of all our converters and shape calculators.
from . import shape_calculators # noqa: F401
from . import operator_converters # noqa: F401
def convert_sklearn(
model,
name=None,
initial_types=None,
doc_string="",
target_opset=None,
custom_conversion_functions=None,
custom_shape_calculators=None,
custom_parsers=None,
options=None,
intermediate=False,
white_op=None,
black_op=None,
final_types=None,
dtype=None,
naming=None,
model_optim=True,
verbose=0,
):
"""
    This function converts the specified *scikit-learn* model
    into its *ONNX* counterpart.
    The list of supported converters is returned by function
    :func:`supported_converters <skl2onnx.supported_converters>`.
    For a pipeline conversion, the user needs to make sure each component
    is one of the supported models.
    Note that initial types are required for every conversion.
    The *ONNX* model name can also be specified.
    :param model: A scikit-learn model
    :param initial_types: a Python list.
        Each element is a tuple of a variable name
        and a type defined in `data_types.py`
    :param name: The name of the graph (type: GraphProto)
        in the produced ONNX model (type: ModelProto)
    :param doc_string: A string attached onto the produced ONNX model
    :param target_opset: number, for example, 7 for
        ONNX 1.2, and 8 for ONNX 1.3.
        If the value is not specified, the function chooses
        the latest tested opset
        (see :py:func:`skl2onnx.get_latest_tested_opset_version`)
    :param custom_conversion_functions: a dictionary for
        specifying user-customized conversion functions;
        they take precedence over registered converters
    :param custom_shape_calculators: a dictionary for
        specifying user-customized shape calculators;
        they take precedence over registered shape calculators
    :param custom_parsers: a parser determines which outputs
        are expected for a particular task;
        default parsers are defined for classifiers,
        regressors and pipelines, but they can be rewritten;
        *custom_parsers* is a dictionary
        ``{ type: fct_parser(scope, model, inputs, custom_parsers=None) }``
    :param options: specific options given to converters
        (see :ref:`l-conv-options`)
    :param intermediate: if True, the function returns both the
        converted model and the instance of :class:`Topology` used;
        otherwise it returns only the converted model
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline;
        if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline;
        if empty, none are blacklisted
    :param final_types: a Python list. It works the same way as
        *initial_types* but is not mandatory; it is used to overwrite
        the type (if the type is not None) and the name of every output.
    :param dtype: removed in version 1.7.5; the dtype is
        now inferred from the input types, and
        converters may add *Cast* operators to switch
        to double when necessary
    :param naming: the user may want to change the way intermediate
        results are named; this parameter can be a string (a prefix) or a
        function whose signature is
        `get_name(name, existing_names)`; the library then
        checks that the name is unique and modifies it if not
        (see the sketch after the options examples below)
    :param model_optim: enable or disable model optimisation
        after the model is converted to ONNX; it reduces the number
        of *Identity* nodes
    :param verbose: display progress while converting a model
    :return: An ONNX model (type: ModelProto) which is
        equivalent to the input scikit-learn model
    Example of *initial_types*:
    Assume that the specified *scikit-learn* model takes
    a heterogeneous list as its input.
    If the first 5 elements are floats and the last 10 elements are integers,
    we need to specify the initial types as below. The *None* in
    *[None, 5]* indicates that the batch size is unknown.

    ::

        from skl2onnx.common.data_types import FloatTensorType, Int64TensorType
        initial_type = [('float_input', FloatTensorType([None, 5])),
                        ('int64_input', Int64TensorType([None, 10]))]
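
    The conversion itself is then a single call (a sketch, assuming
    *model* is a fitted estimator consuming both inputs):

    ::

        onx = convert_sklearn(model, initial_types=initial_type)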
    .. note::

        If a pipeline includes an instance of
        `ColumnTransformer <https://scikit-learn.org/stable/modules/
        generated/sklearn.compose.ColumnTransformer.html>`_,
        *scikit-learn* allows the user to specify columns by name.
        This option is not supported by *sklearn-onnx* as feature names
        could differ between the input data and the ONNX graph
        (defined by parameter *initial_types*); only integers are supported.
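
    For instance, a pipeline remains convertible when columns are
    selected by position. A minimal sketch, assuming the first two
    columns are numerical and the third one categorical:

    ::

        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OneHotEncoder, StandardScaler

        preprocessor = ColumnTransformer([
            # columns are selected by integer position, not by name
            ("num", StandardScaler(), [0, 1]),
            ("cat", OneHotEncoder(), [2]),
        ])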
    Converters options
    ++++++++++++++++++

    Some ONNX operators expose parameters *sklearn-onnx* cannot
    guess from the raw model. Default values are usually suggested,
    but the user may have to overwrite them manually. This is not
    obvious to do when the model is included in a pipeline.
    That's why these options can be given to function *convert_sklearn*
    as a dictionary ``{model_type: parameters in a dictionary}`` or
    ``{model_id: parameters in a dictionary}``.
    Option *separators* is used to specify the delimiters between two words
    when the ONNX graph needs to tokenize a string.
    The default value is short and may not include all
    the necessary values. It can be overwritten as:
    ::

        extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?',
                                                  ',', ';', ':', '\\\\!',
                                                  '\\\\(', '\\\\)']}}
        model_onnx = convert_sklearn(
            model, "tfidf",
            initial_types=[("input", StringTensorType([None, 1]))],
            options=extra)
    But if a pipeline contains two models of the same class,
    it is possible to distinguish between them with the built-in function *id*:
    ::

        extra = {id(model): {"separators": [' ', '.', '\\\\?', ',', ';',
                                            ':', '\\\\!', '\\\\(', '\\\\)']}}
        model_onnx = convert_sklearn(
            pipeline, "pipeline-with-2-tfidf",
            initial_types=[("input", StringTensorType([None, 1]))],
            options=extra)
It is used in example :ref:`l-example-tfidfvectorizer`.
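
    Parameter *naming* also accepts a callable. A minimal sketch
    (the helper name *my_naming* below is hypothetical):

    ::

        def my_naming(name, existing_names):
            # return a prefixed candidate; the library then
            # makes sure the final name is unique
            return "skl_" + name

        model_onnx = convert_sklearn(
            model, initial_types=initial_type, naming=my_naming)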
    .. versionchanged:: 1.10.0
        Parameter *naming* was added.
"""
if initial_types is None:
if hasattr(model, "infer_initial_types"):
initial_types = model.infer_initial_types()
else:
raise ValueError(
"Initial types are required. See usage of "
"convert(...) in skl2onnx.convert for details"
)
if name is None:
name = str(uuid4().hex)
if dtype is not None:
warnings.warn(
"Parameter dtype is no longer supported. It will be removed in 1.9.0.",
DeprecationWarning,
stacklevel=0,
)
target_opset = target_opset if target_opset else get_latest_tested_opset_version()
# Parse scikit-learn model as our internal data structure
# (i.e., Topology)
if verbose >= 1:
print("[convert_sklearn] parse_sklearn_model")
topology = parse_sklearn_model(
model,
initial_types,
target_opset,
custom_conversion_functions,
custom_shape_calculators,
custom_parsers,
options=options,
white_op=white_op,
black_op=black_op,
final_types=final_types,
naming=naming,
)
# Convert our Topology object into ONNX. The outcome is an ONNX model.
options = _process_options(model, options)
if verbose >= 1:
print("[convert_sklearn] convert_topology")
onnx_model = convert_topology(
topology,
name,
doc_string,
target_opset,
options=options,
remove_identity=model_optim and not intermediate,
verbose=verbose,
)
if verbose >= 1:
print("[convert_sklearn] end")
if verbose >= 2:
scope = topology.scopes[0]
print("---INPUTS---")
for inp in scope.input_variables:
print(" %r" % inp)
print("---OUTPUTS---")
        for out in scope.output_variables:
            print(" %r" % out)
print("---VARIABLES---")
for k, v in sorted(scope.variables.items()):
print(" %r: is.fed=%r is_leaf=%r - %r" % (k, v.is_fed, v.is_leaf, v))
print("---OPERATORS---")
for k, v in sorted(scope.operators.items()):
print(" %r: is.evaluated=%r - %r" % (k, v.is_evaluated, v))
return (onnx_model, topology) if intermediate else onnx_model
def to_onnx(
model: sklearn.base.BaseEstimator,
    X: Optional[np.ndarray] = None,
name: Optional[str] = None,
initial_types: Optional[
List[Tuple[str, Sequence[Optional[Union[int, str]]]]]
] = None,
target_opset: Optional[Union[Dict[str, int], int]] = None,
options: Optional[Dict] = None,
white_op: Optional[Set[str]] = None,
black_op: Optional[Set[str]] = None,
final_types: Optional[List[Tuple[str, Sequence[Optional[Union[int, str]]]]]] = None,
dtype: Optional[np.dtype] = None,
naming: Optional[Callable] = None,
model_optim: bool = True,
verbose: int = 0,
):
"""
Calls :func:`convert_sklearn` with simplified parameters.
    :param model: model to convert
    :param X: training set, can be None; it is used to infer the
        input types (*initial_types*)
    :param initial_types: if X is None, then *initial_types* must be
        defined
    :param target_opset: conversion with a specific target opset
    :param options: specific options given to converters
        (see :ref:`l-conv-options`)
    :param name: name of the model
    :param white_op: white list of ONNX nodes allowed
        while converting a pipeline; if empty, all are allowed
    :param black_op: black list of ONNX nodes disallowed
        while converting a pipeline; if empty, none are blacklisted
    :param final_types: a Python list. It works the same way as
        *initial_types* but is not mandatory; it is used to overwrite
        the type (if the type is not None) and the name of every output.
    :param dtype: removed in version 1.7.5; the dtype is now inferred from
        the input types, and converters may add *Cast* operators to switch
        to double when necessary
    :param naming: the user may want to change the way intermediate
        results are named; this parameter can be a string (a prefix) or a
        function whose signature is
        `get_name(name, existing_names)`; the library then
        checks that the name is unique and modifies it if not
    :param model_optim: enable or disable model optimisation
        after the model is converted to ONNX; it reduces the number
        of *Identity* nodes
    :param verbose: display progress while converting a model
    :return: converted model
    If the model inherits from class :class:`OnnxOperatorMixin`,
    this function calls its method *to_onnx*;
    otherwise it calls :func:`convert_sklearn`.
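
    A minimal usage sketch (assuming *X_train* and *y_train* are
    numpy training arrays):

    ::

        import numpy as np
        from sklearn.linear_model import LogisticRegression
        from skl2onnx import to_onnx

        model = LogisticRegression().fit(X_train, y_train)
        # input types are inferred from X, hence the explicit cast
        onx = to_onnx(model, X_train.astype(np.float32))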
    .. versionchanged:: 1.10.0
        Parameter *naming* was added.

    .. versionchanged:: 1.18.0
        The main opset is now equal to *target_opset* and no longer
        a value less than or equal to the given value.
"""
from .algebra.onnx_operator_mixin import OnnxOperatorMixin
from .algebra.type_helper import guess_initial_types
if isinstance(model, OnnxOperatorMixin):
if options is not None:
raise NotImplementedError(
"options not yet implemented for OnnxOperatorMixin."
)
return model.to_onnx(X=X, name=name, target_opset=target_opset)
if name is None:
name = "ONNX(%s)" % model.__class__.__name__
initial_types = guess_initial_types(X, initial_types)
if verbose >= 1:
print("[to_onnx] initial_types=%r" % initial_types)
model = convert_sklearn(
model,
initial_types=initial_types,
target_opset=target_opset,
name=name,
options=options,
white_op=white_op,
black_op=black_op,
final_types=final_types,
dtype=dtype,
verbose=verbose,
naming=naming,
model_optim=model_optim,
)
    # Align the main opset (empty domain) with the requested target opset:
    # since 1.18.0, the main opset must be exactly *target_opset*.
    new_target_model = None
for op in model.opset_import:
if op.domain == "":
new_target_model = op.version
break
expected_target = (
target_opset
if not isinstance(target_opset, dict)
else target_opset.get("", None)
)
if expected_target is not None and new_target_model != expected_target:
for op in model.opset_import:
if op.domain == "":
op.version = expected_target
break
return model
def wrap_as_onnx_mixin(model, target_opset=None):
"""
    Combines a *scikit-learn* class with :class:`OnnxOperatorMixin`
    to produce a new object exposing both the *scikit-learn* API
    and the *OnnxOperatorMixin* API.
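
    A minimal sketch (assuming *X_train* and *y_train* are numpy
    training arrays):

    ::

        import numpy as np
        from sklearn.linear_model import LogisticRegression
        from skl2onnx.convert import wrap_as_onnx_mixin

        model = wrap_as_onnx_mixin(LogisticRegression())
        model.fit(X_train, y_train)
        # the wrapped object also exposes OnnxOperatorMixin.to_onnx
        onx = model.to_onnx(X=X_train.astype(np.float32))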
"""
from .algebra.sklearn_ops import find_class
cl = find_class(model.__class__)
if "automation" in str(cl):
raise RuntimeError("Wrong class name '{}'.".format(cl))
state = model.__getstate__()
obj = object.__new__(cl)
obj.__setstate__(state)
obj.op_version = target_opset
return obj