Note
Go to the end to download the full example code.
Benchmark a pipeline¶
The following example inspects every step of a pipeline, then compares and benchmarks the predictions obtained with scikit-learn and with onnxruntime.
Create a pipeline¶
We reuse the pipeline implemented in example
Pipelining: chaining a PCA and a logistic regression.
There is one change because the
ONNX-ML Imputer
does not handle string types. Such a step cannot be part of the final ONNX pipeline
and must be removed. Look for the comment starting with ---
below.
import skl2onnx
import onnx
import sklearn
import numpy
from skl2onnx.helpers import collect_intermediate_steps
from timeit import timeit
from skl2onnx.helpers import compare_objects
import onnxruntime as rt
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Training data: the first 1000 samples of the digits dataset.
digits = datasets.load_digits()
X_digits, y_digits = digits.data[:1000], digits.target[:1000]

# Two-stage pipeline: a PCA projection feeding a logistic regression.
# Both estimators keep their default hyper-parameters.
pca = PCA()
logistic = LogisticRegression()
pipe = Pipeline(steps=[("pca", pca), ("logistic", logistic)])
pipe.fit(X_digits, y_digits)
Conversion to ONNX¶
# Declare a single float input named "input" with a free batch dimension
# and as many columns as the training data.
n_features = X_digits.shape[1]
initial_types = [("input", FloatTensorType((None, n_features)))]

# Convert the fitted pipeline and load the resulting graph with onnxruntime.
model_onnx = convert_sklearn(pipe, initial_types=initial_types, target_opset=12)
onnx_bytes = model_onnx.SerializeToString()
sess = rt.InferenceSession(onnx_bytes, providers=["CPUExecutionProvider"])

print("skl predict_proba")
print(pipe.predict_proba(X_digits[:2]))

# The graph expects float32, so the first two rows are cast before the call.
# Index 1 of the returned outputs holds the probabilities.
feed = {"input": X_digits[:2].astype(np.float32)}
onx_pred = sess.run(None, feed)[1]

# Going through a DataFrame turns the probabilities into a plain 2-D array
# before printing.
df = pd.DataFrame(onx_pred)
print("onnx predict_proba")
print(df.values)
skl predict_proba
[[9.99998530e-01 7.81608913e-19 4.87445968e-10 1.79842280e-08
3.58700551e-10 1.18138028e-06 4.14411048e-08 1.48275025e-07
2.50162849e-08 5.51240030e-08]
[1.37889361e-14 9.99999324e-01 9.17867432e-11 8.30390362e-13
2.57277808e-07 8.84035057e-12 5.11781442e-11 2.83346412e-11
4.18965302e-07 1.32796357e-13]]
onnx predict_proba
[[9.99998569e-01 7.81611026e-19 4.87444585e-10 1.79842026e-08
3.58700042e-10 1.18137802e-06 4.14409520e-08 1.48274751e-07
2.50162131e-08 5.51239410e-08]
[1.37888807e-14 9.99999344e-01 9.17865159e-11 8.30387679e-13
2.57277748e-07 8.84032951e-12 5.11779785e-11 2.83345725e-11
4.18964021e-07 1.32796280e-13]]
Comparing outputs¶
# compare_objects signals a mismatch with an exception, so getting past
# this call means both runtimes produced the same probabilities.
expected_proba = pipe.predict_proba(X_digits[:2])
compare_objects(expected_proba, onx_pred)
Benchmarks¶
scikit-learn
3.638581280000153
onnxruntime
0.17826826999998957
Intermediate steps¶
Let’s imagine the final output is wrong and we need to inspect each component of the pipeline to find out which one is failing. The following method modifies the scikit-learn pipeline to capture the intermediate outputs and produces a smaller ONNX graph for every operator.
# Convert the pipeline again, this time collecting one entry per operator.
# Each entry exposes the ONNX graph of the step ("onnx_step") and the
# matching scikit-learn estimator ("model").
steps = collect_intermediate_steps(pipe, "pipeline", initial_types)

# The pipeline chains two operators: the PCA and the logistic regression.
assert len(steps) == 2

# Run the pipeline once; the outputs of every step are read back below
# through ``step["model"]._debug.outputs``.
# NOTE(review): presumably this call is what fills ``_debug.outputs`` - confirm.
pipe.predict_proba(X_digits[:2])

# ``step`` and ``sess`` must stay module-level names: the timeit statements
# below are evaluated against ``globals()``.
for step in steps:
    # Load the ONNX graph attached to this step in its own session.
    onnx_step = step["onnx_step"]
    sess = rt.InferenceSession(
        onnx_step.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    onnx_outputs = sess.run(None, {"input": X_digits[:2].astype(np.float32)})
    skl_outputs = step["model"]._debug.outputs

    # A step that recorded a "transform" output is compared against the
    # first ONNX output; any other step is compared on "predict_proba"
    # against the second ONNX output.
    if "transform" in skl_outputs:
        method, onnx_index = "transform", 0
    else:
        method, onnx_index = "predict_proba", 1

    # Raises if scikit-learn and onnxruntime disagree on this step.
    compare_objects(skl_outputs[method], onnx_outputs[onnx_index])

    print("benchmark", step["model"].__class__)
    print("scikit-learn")
    print(
        timeit(
            f"step['model'].{method}(X_digits[:1])",
            number=10000,
            globals=globals(),
        )
    )
    print("onnxruntime")
    print(
        timeit(
            "sess.run(None, {'input': X_digits[:1].astype(np.float32)})",
            number=10000,
            globals=globals(),
        )
    )
benchmark <class 'sklearn.decomposition._pca.PCA'>
scikit-learn
0.9553307900000618
onnxruntime
0.11795157300002757
benchmark <class 'sklearn.linear_model._logistic.LogisticRegression'>
scikit-learn
1.2479175040000428
onnxruntime
0.14069998299987674
Versions used for this example
# Report the version of every package involved in this example.
for label, module in (
    ("numpy:", numpy),
    ("scikit-learn:", sklearn),
    ("onnx: ", onnx),
    ("onnxruntime: ", rt),
    ("skl2onnx: ", skl2onnx),
):
    print(label, module.__version__)
numpy: 2.3.1
scikit-learn: 1.6.1
onnx: 1.19.0
onnxruntime: 1.23.0
skl2onnx: 1.19.1
Total running time of the script: (0 minutes 6.501 seconds)