Note
Go to the end to download the full example code
Converter for pyod.models.iforest.IForest#
This example answers issue 685. It implements a custom converter for the model pyod.models.iforest.IForest, starting from the example Implement a new converter.
Trains a model#
All imports. It also registers ONNX converters for xgboost and lightgbm.
import numpy as np
import pandas as pd
from onnxruntime import InferenceSession
from sklearn.preprocessing import MinMaxScaler
from skl2onnx.proto import onnx_proto
from skl2onnx.common.data_types import (
FloatTensorType,
Int64TensorType,
guess_numpy_type,
)
from skl2onnx import to_onnx, update_registered_converter, get_model_alias
from skl2onnx.algebra.onnx_ops import (
OnnxIdentity,
OnnxMul,
OnnxLess,
OnnxConcat,
OnnxCast,
OnnxAdd,
OnnxClip,
)
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator
# pyod is an optional dependency: fall back to None when it is missing
# so the rest of the example can be skipped gracefully.
IForest = None
try:
    from pyod.models.iforest import IForest
except (ValueError, ImportError) as exc:
    print("Unable to import pyod:", exc)
if IForest is not None:
    # Small toy dataset: one numerical column and one categorical column.
    data1 = {
        "First": [500, 500, 400, 100, 200, 300, 100],
        "Second": ["a", "b", "a", "b", "a", "b", "c"],
    }
    df1 = pd.DataFrame(data1, columns=["First", "Second"])
    # One-hot encode the categorical column.
    dumdf1 = pd.get_dummies(df1)
    # Scale every column into [0, 1] before fitting the detector.
    scaler = MinMaxScaler()
    scaler.partial_fit(dumdf1)
    sc_data = scaler.transform(dumdf1)
    # Train the pyod isolation forest on the scaled data.
    model1 = IForest(
        n_estimators=10,
        bootstrap=True,
        behaviour="new",
        contamination=0.1,
        random_state=np.random.RandomState(42),
        verbose=1,
        n_jobs=-1,
    ).fit(sc_data)
    feature_names2 = dumdf1.columns
    # ONNX input signature: a float tensor with a dynamic batch dimension.
    initial_type = [("float_input", FloatTensorType([None, len(feature_names2)]))]
Unable to import pyod: No module named 'pyod'
We check that the conversion fails as expected.
if IForest is not None:
    # No converter is registered for IForest yet, so this must fail.
    try:
        to_onnx(model1, initial_types=initial_type)
    except Exception as err:
        print(err)
Custom converter#
First the parser and the shape calculator. The parser defines the number of outputs and their type. The shape calculator defines their dimensions.
def pyod_iforest_parser(scope, model, inputs, custom_parsers=None):
    """Declare the custom IForest operator and its two outputs.

    The parser tells skl2onnx how many outputs the operator produces
    and their types: an int64 label tensor and a probability tensor of
    the same type class as the input.
    """
    alias = get_model_alias(type(model))
    this_operator = scope.declare_local_operator(alias, model)

    # The operator consumes the unique input.
    this_operator.inputs.append(inputs[0])

    # Probabilities share the input tensor's type (float or double).
    proba_type = inputs[0].type.__class__
    this_operator.outputs.extend(
        [
            scope.declare_local_variable("label", Int64TensorType()),
            scope.declare_local_variable("probability", proba_type()),
        ]
    )
    return this_operator.outputs
def pyod_iforest_shape_calculator(operator):
    """Set output shapes: labels are [N, 1], probabilities are [N, 2]."""
    batch = operator.inputs[0].get_first_dimension()
    labels = operator.outputs[0]
    probabilities = operator.outputs[1]
    labels.type.shape = [batch, 1]
    probabilities.type.shape = [batch, 2]
Then the converter.
def pyod_iforest_converter(scope, operator, container):
    """Convert a fitted pyod IForest into ONNX nodes.

    The embedded scikit-learn estimator (``detector_``) is converted
    with :class:`OnnxSubEstimator`; its raw scores are then turned into
    a binary label (1 when the score is strictly below ``threshold_``)
    and a two-column matrix mimicking ``predict_proba``.
    """
    op = operator.raw_operator
    opv = container.target_opset
    out = operator.outputs

    # We retrieve the unique input.
    X = operator.inputs[0]

    # In most cases computation happens in floats, but it might be with
    # doubles.  ONNX is very strict about types: every constant must
    # have the same type as the input.
    dtype = guess_numpy_type(X.type)

    detector = op.detector_  # Fitted scikit-learn estimator inside pyod.
    lab_pred = OnnxSubEstimator(detector, X, op_version=opv)
    # Second output of the sub-estimator holds the raw scores.
    scores = OnnxIdentity(lab_pred[1], op_version=opv)

    # labels: Cast(score < threshold_) -> int64 in {0, 1}.
    threshold = op.threshold_
    below_threshold = OnnxLess(
        scores, np.array([threshold], dtype=dtype), op_version=opv
    )
    labels = OnnxCast(
        below_threshold,
        op_version=opv,
        to=onnx_proto.TensorProto.INT64,
        output_names=out[:1],
    )

    # probabilities: min-max scale the negated scores into [0, 1] using
    # the scores observed at training time, then clip to [0, 1].
    train_scores = op.decision_scores_
    scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
    neg_scores = OnnxMul(scores, np.array([-1], dtype=dtype), op_version=opv)
    # Replicate MinMaxScaler.transform: X * scale_ + min_.
    scaled = OnnxMul(neg_scores, scaler.scale_.astype(dtype), op_version=opv)
    scaled_centered = OnnxAdd(scaled, scaler.min_.astype(dtype), op_version=opv)
    clipped = OnnxClip(
        scaled_centered,
        np.array([0], dtype=dtype),
        np.array([1], dtype=dtype),
        op_version=opv,
    )
    # Complement: probability of the other class is 1 - clipped.
    clipped_ = OnnxAdd(
        OnnxMul(clipped, np.array([-1], dtype=dtype), op_version=opv),
        np.array([1], dtype=dtype),
        op_version=opv,
    )
    scores_2d = OnnxConcat(
        clipped_, clipped, axis=1, op_version=opv, output_names=out[1:]
    )

    labels.add_to(scope, container)
    scores_2d.add_to(scope, container)
Finally the registration.
if IForest is not None:
    # Register the parser / shape calculator / converter trio under the
    # alias "PyodIForest" so to_onnx knows how to handle IForest.
    update_registered_converter(
        IForest,
        "PyodIForest",
        pyod_iforest_shape_calculator,
        pyod_iforest_converter,
        parser=pyod_iforest_parser,
    )
And the conversion.
if IForest is not None:
    # Convert with pinned opsets: 14 for the main domain, 2 for ai.onnx.ml.
    onx = to_onnx(
        model1, initial_types=initial_type, target_opset={"": 14, "ai.onnx.ml": 2}
    )
Checking discrepancies#
if IForest is not None:
    data = sc_data.astype(np.float32)

    # Expected outputs computed with the original pyod model.
    expected_labels = model1.predict(data)
    expected_proba = model1.predict_proba(data)

    # Outputs of the converted model computed with onnxruntime.
    sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
    res = sess.run(None, {"float_input": data})
    onx_labels = res[0]
    onx_proba = res[1]

    # Maximum absolute differences between the two implementations.
    diff_labels = np.abs(onx_labels.ravel() - expected_labels.ravel()).max()
    diff_proba = np.abs(onx_proba.ravel() - expected_proba.ravel()).max()

    # Fixed typo in the printed message ("dicrepencies").
    print("discrepancies:", diff_labels, diff_proba)
    print("ONNX labels", onx_labels)
    print("ONNX probabilities", onx_proba)
Total running time of the script: (0 minutes 0.008 seconds)