Examples
Examples of experiment tracking with popular ML libraries.
All the examples are also available on GitHub.
sklearn
The following example demonstrates how to train a Random Forest Regressor using sklearn
and TrackingClient.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from oip_tracking_client.tracking import TrackingClient
from sklearn.metrics import mean_squared_error, r2_score

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "experiment_name"
TrackingClient.set_experiment(experiment_name)

# Load the dataset and split it into training and testing sets.
db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Create and train the RandomForestRegressor model.
    rf = RandomForestRegressor(n_estimators=10, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)
    # Make predictions on the test set
    y_pred = rf.predict(X_test)
    # Calculate and log Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)
    TrackingClient.log_metric("mse", mse)
    # Calculate and log R-squared
    r2 = r2_score(y_test, y_pred)
    TrackingClient.log_metric("r2", r2)
    signature = TrackingClient.infer_signature(X_train, y_train)
    TrackingClient.sklearn.log_model(rf, "model", signature=signature)
Upon training the model with TrackingClient, the following MLmodel YAML file is generated; it stores essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.sklearn
    model_path: model.pkl
    predict_fn: predict
    python_version: 3.9.7
  sklearn:
    code: null
    pickled_model: model.pkl
    serialization_format: cloudpickle
    sklearn_version: 1.0.2
mlflow_version: 2.4.0
model_uuid: e456b488a05942b58f173cfec10cdc42
run_id: 2fcfd26d59ad41f7aad7e6f487c56dd3
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 10]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-24 09:52:24.567632'
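Because the MLmodel file records a standard MLflow sklearn flavor, the logged model can be loaded back for inference. The following is a minimal sketch, assuming the MLflow tracking URI already points at the same backend (for example, after TrackingClient.connect); <RUN_ID> is a placeholder for the run_id shown above.

import numpy as np
import mlflow.sklearn

# Assumption: the MLflow tracking URI points at the same server that
# TrackingClient.connect was called against; "<RUN_ID>" is a placeholder.
loaded_rf = mlflow.sklearn.load_model("runs:/<RUN_ID>/model")

# The logged signature expects float64 input of shape (-1, 10)
sample = np.random.rand(2, 10)
print(loaded_rf.predict(sample))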
xgboost
The following example demonstrates how to train an XGBoost model using the xgboost
library and TrackingClient.
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "experiment_name"
TrackingClient.set_experiment(experiment_name)

# Load the dataset and split it into training and testing sets.
db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Create and train the XGBoost model.
    params = {
        "n_estimators": 100,
        "max_depth": 6,
        "learning_rate": 0.1,
        "objective": "reg:squarederror",
    }
    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(X_train, y_train)
    signature = TrackingClient.infer_signature(X_train, y_train)
    TrackingClient.xgboost.log_model(xgb_model, "model", signature=signature)
After training the XGBoost model with TrackingClient, the following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    data: model.xgb
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.xgboost
    python_version: 3.9.7
  xgboost:
    code: null
    data: model.xgb
    model_class: xgboost.sklearn.XGBRegressor
    model_format: xgb
    xgb_version: 1.6.2
mlflow_version: 2.4.0
model_uuid: 1b41fa0786e34aedb2db835a43a71a05
run_id: 56159271f40f4351976391ff0bdadd77
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 10]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-24 10:24:57.517327'
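The python_function entry in the flavors section means the same artifact can also be loaded in a framework-agnostic way. A minimal sketch, under the same assumptions as above (MLflow-compatible tracking URI, <RUN_ID> as a placeholder):

import numpy as np
import mlflow.pyfunc

# Load the model through the generic pyfunc flavor listed in the MLmodel file;
# "<RUN_ID>" is a placeholder for the run_id above.
model = mlflow.pyfunc.load_model("runs:/<RUN_ID>/model")

# Input must match the logged signature: float64, shape (-1, 10)
sample = np.random.rand(3, 10)
print(model.predict(sample))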
statsmodels
The following example demonstrates how to train an ordinary least squares (OLS)
regression model using statsmodels and TrackingClient.
import numpy as np
import statsmodels.api as sm
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "experiment_name"
TrackingClient.set_experiment(experiment_name)

# Generate sample data with shape (-1, 10) for 10 features and a single output
np.random.seed(42)
num_samples = 100
num_features = 10
X = np.random.rand(num_samples, num_features)

# Generating the response variable as a linear combination of the features with some noise
true_coefficients = np.random.rand(num_features)
noise = np.random.normal(loc=0, scale=0.1, size=num_samples)
y = np.dot(X, true_coefficients) + noise

# Add a constant term to the independent variable (intercept)
X = sm.add_constant(X)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Create a linear regression model
    model = sm.OLS(y, X)
    # Fit the model to the data
    model_fit = model.fit()
    # Log the model using mlflow.statsmodels.log_model
    signature = TrackingClient.infer_signature(X, y)
    TrackingClient.statsmodels.log_model(model_fit, "model", signature=signature)
After training the OLS model with TrackingClient, the following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    data: model.statsmodels
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.statsmodels
    python_version: 3.9.7
  statsmodels:
    code: null
    data: model.statsmodels
    statsmodels_version: 0.13.2
mlflow_version: 2.4.0
model_uuid: 2e70fe54dc224704ac0870af09487bbc
run_id: 3d0b08ff27314c759400580083e98c4f
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 11]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-24 10:22:09.899267'
keras
Keras is a high-level neural networks API, written in Python, that allows for easy and fast prototyping and experimentation. You can use Keras in conjunction with TrackingClient to track and manage experiments for model training and inference.
import numpy as np
from tensorflow import keras
from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

max_words = 50
batch_size = 32
epochs = 4

# Loading data
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=max_words, test_split=0.2
)
num_classes = np.max(y_train) + 1

# Vectorizing sequence data
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode="binary")
x_test = tokenizer.sequences_to_matrix(x_test, mode="binary")

# Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.tensorflow.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Building model
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation("softmax"))
    model.compile(
        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_split=0.1,
    )
    # Log the model using the Keras flavor
    signature = TrackingClient.infer_signature(x_train, y_train)
    TrackingClient.keras.log_model(model, "model", signature=signature)
After running the above code, TrackingClient will automatically log all relevant metrics, parameters, and artifacts (such as the trained model) during the training process.
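Because TrackingClient is MLflow-based (see the mlflow_version field in the MLmodel file below), the autologged values can also be read back programmatically. A minimal sketch, assuming the standard MLflow client can reach the same tracking server and with <RUN_ID> as a placeholder:

from mlflow.tracking import MlflowClient

# Assumption: the MLflow tracking URI points at the same server used by
# TrackingClient; "<RUN_ID>" is a placeholder for the run created above.
client = MlflowClient()
run = client.get_run("<RUN_ID>")

print(run.data.params)   # hyperparameters captured by autologging
print(run.data.metrics)  # latest values of the autologged metrics (loss, accuracy, ...)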
After training the Keras model with TrackingClient, the following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    data: data
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.tensorflow
    python_version: 3.9.7
  tensorflow:
    code: null
    data: data
    keras_version: 2.8.0
    model_type: keras
    save_format: tf
mlflow_version: 2.4.0
model_uuid: 82aa11563c54406bb1bafc76feafac5a
run_id: 56ccd94b83fa4d83865f0b8cb3644131
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 50]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float32", "shape": [-1, 46]}}]'
utc_time_created: '2023-11-24 10:42:44.701726'
Transformers
This section demonstrates how to perform sentiment analysis using the transformers library and log the model with TrackingClient. We'll use a pre-trained BERT model for sentiment analysis.
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Define a sample movie review
review = "I really enjoyed watching this movie."

# Tokenize the review and convert to tensor
inputs = tokenizer(
    review, return_tensors="pt", padding=True, truncation=True, max_length=128
)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Start MLflow run
TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Log parameters
    TrackingClient.log_param("model_name", model_name)
    # Perform sentiment analysis
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # Get the predicted label
    predicted_label = torch.argmax(outputs.logits).item()
    sentiment = "positive" if predicted_label == 1 else "negative"
    TrackingClient.transformers.log_model(sentiment_analysis, "model")
The following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.transformers
    model_binary: model
    python_version: 3.9.7
  transformers:
    code: null
    components:
    - tokenizer
    framework: pt
    instance_type: TextClassificationPipeline
    model_binary: model
    pipeline_model_type: BertForSequenceClassification
    source_model_name: bert-base-uncased
    task: sentiment-analysis
    tokenizer_type: BertTokenizer
    transformers_version: 4.29.2
mlflow_version: 2.4.0
model_uuid: 07915c2dafdb4373bc76ce3a2425888e
run_id: 6914f9db67424052bee8e139acf74b70
signature:
  inputs: '[{"type": "string"}]'
  outputs: '[{"type": "string", "name": "label"}, {"type": "double", "name": "score"}]'
utc_time_created: '2023-11-24 09:13:36.883081'
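The logged pipeline can be loaded back with the transformers flavor and used directly. A minimal sketch, under the same assumptions as the previous sections (MLflow-compatible tracking URI, <RUN_ID> as a placeholder):

import mlflow.transformers

# Load the sentiment-analysis pipeline logged above; "<RUN_ID>" is a placeholder
loaded_pipeline = mlflow.transformers.load_model("runs:/<RUN_ID>/model")

# Per the logged signature, the pipeline takes strings and returns a label and a score
print(loaded_pipeline("I really enjoyed watching this movie."))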
PyTorch
This example showcases the training of a Convolutional Neural Network (CNN) using PyTorch for image classification on the FashionMNIST dataset. The code includes integration with TrackingClient for efficient experiment tracking.
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, ConfusionMatrix, RunningAverage
from ignite.handlers import EarlyStopping
from ignite.contrib.handlers import ProgressBar
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

# Transform to normalize the data
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Download and load the training data
trainset = datasets.FashionMNIST(
    "./data", download=True, train=True, transform=transform
)
train_loader = DataLoader(trainset, batch_size=64, shuffle=True)

# Download and load the test data
validationset = datasets.FashionMNIST(
    "./data", download=True, train=False, transform=transform
)
val_loader = DataLoader(validationset, batch_size=64, shuffle=True)

# CNN model class
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.convlayer1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.convlayer2 = nn.Sequential(
            nn.Conv2d(32, 64, 3), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2)
        )
        self.fc1 = nn.Linear(64 * 6 * 6, 600)
        self.drop = nn.Dropout2d(0.25)
        self.fc2 = nn.Linear(600, 120)
        self.fc3 = nn.Linear(120, 10)

    def forward(self, x):
        x = self.convlayer1(x)
        x = self.convlayer2(x)
        x = x.view(-1, 64 * 6 * 6)
        x = self.fc1(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

# Creating model, optimizer, and loss
model = CNN()
# Moving model to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()

# Defining the number of epochs
epochs = 3

# Creating trainer and evaluator
trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
metrics = {
    "accuracy": Accuracy(),
    "nll": Loss(criterion),
    "cm": ConfusionMatrix(num_classes=10),
}
train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

# Early stopping based on validation loss
def score_function(engine):
    val_loss = engine.state.metrics["nll"]
    return -val_loss

handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
val_evaluator.add_event_handler(Events.COMPLETED, handler)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    pbar = ProgressBar(persist=True, bar_format="")
    pbar.attach(trainer, ["loss"])
    trainer.run(train_loader, max_epochs=epochs)
    # Infer the signature from a training batch and log the model
    x_train_batch, y_train_batch = next(iter(train_loader))
    x_train_np = x_train_batch.numpy()
    y_train_np = y_train_batch.numpy()
    signature = TrackingClient.infer_signature(x_train_np, y_train_np)
    TrackingClient.pytorch.log_model(model, "model", signature=signature)
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    data: data
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.pytorch
    pickle_module_name: mlflow.pytorch.pickle_module
    python_version: 3.8.10
  pytorch:
    code: null
    model_data: data
    pytorch_version: 1.13.1+cu117
mlflow_version: 2.6.0
model_uuid: dd70b8ae29354d9091a1a94ffdb9b2a3
run_id: ee063c63cb5b49b89953cefff6a203b9
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float32", "shape": [-1, 1, 28, 28]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]'
  params: null
utc_time_created: '2023-11-27 14:00:32.746631'
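The logged network can be restored as a regular torch.nn.Module through the pytorch flavor. A minimal sketch, assuming an MLflow-compatible tracking URI and a CPU environment; <RUN_ID> is a placeholder:

import torch
import mlflow.pytorch

# Restore the trained CNN; "<RUN_ID>" is a placeholder for the run shown above
loaded_model = mlflow.pytorch.load_model("runs:/<RUN_ID>/model")
loaded_model.eval()

# The logged signature expects float32 tensors of shape (-1, 1, 28, 28)
dummy = torch.randn(1, 1, 28, 28)
with torch.no_grad():
    log_probs = loaded_model(dummy)
print(log_probs.argmax(dim=1))  # predicted FashionMNIST class index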
fastai
This example demonstrates the training of a convolutional neural network (CNN) using the fastai library and integrates it with TrackingClient for experiment tracking.
from fastai.vision.all import (
    URLs,
    untar_data,
    ImageDataLoaders,
    cnn_learner,
    resnet18,
    accuracy,
)
import numpy as np
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

path = untar_data(URLs.CIFAR)
dls = ImageDataLoaders.from_folder(path, train="train", valid="test")

# Get a batch of validation data
inputs, targets = dls.valid.one_batch()
# Extract the shape of the validation data
x_valid = inputs.numpy()
_, ch, w, h = x_valid.shape
x_valid = np.zeros(
    shape=(1, w, h, ch), dtype=np.uint8
)  # Note: input could be any NumPy array supported by PIL.Image.fromarray

learn = cnn_learner(dls, resnet18, metrics=accuracy)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Log the parameters
    TrackingClient.log_params({"epochs": 2, "lr": 1e-3})
    # Train the model
    learn.fit_one_cycle(1, lr_max=1e-3)
    # Log the trained model
    y_valid = np.zeros(
        shape=(1, 10), dtype=np.float32
    )  # Model output is a probability vector of size 10
    signature = TrackingClient.infer_signature(x_valid, y_valid)
    TrackingClient.fastai.log_model(learn, "model", signature=signature)
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  fastai:
    code: null
    data: model.fastai
    fastai_version: 2.7.12
  python_function:
    data: model.fastai
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.fastai
    python_version: 3.8.10
mlflow_version: 2.6.0
model_uuid: 8b83ba5aebab4015a88abc0730103367
run_id: dbc2db9f3b85456e88dc9a58be5943bb
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float32", "shape": [-1, 3, 32, 32]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]'
  params: null
utc_time_created: '2023-11-27 14:38:42.470057'
TensorFlow
This example illustrates training a simple neural network using TensorFlow and integrates it with TrackingClient for efficient experiment tracking.
import tensorflow as tf
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10),
    ]
)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    model.fit(x_train, y_train, epochs=1)
    signature = TrackingClient.infer_signature(x_train, y_train)
    TrackingClient.tensorflow.log_model(model, "model", signature=signature)
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    data: data
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.tensorflow
    python_version: 3.9.7
  tensorflow:
    code: null
    data: data
    keras_version: 2.8.0
    model_type: keras
    save_format: tf
mlflow_version: 2.4.0
model_uuid: 1c5b72f9ae684923a175771562531706
run_id: 80c2a7e836c645c6b9189658732beaed
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 28, 28]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "uint8", "shape": [-1]}}]'
utc_time_created: '2023-11-27 15:14:32.143630'
Sentence Transformers
This example demonstrates how to use the Sentence Transformers library for sentence embeddings and integrates it with TrackingClient for experiment tracking.
from sentence_transformers import SentenceTransformer
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # The signature is inferred and logged automatically
    TrackingClient.sentence_transformers.log_model(model, "model")
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    data: model.sentence_transformer
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.sentence_transformers
    python_version: 3.9.7
  sentence_transformers:
    code: null
    sentence_transformers_version: 2.2.2
mlflow_version: 2.4.0
model_uuid: e0eb4cfef2fb496d8c1116a5ac97c067
run_id: 387d039bae5d4dacad46d3d3a6d5baae
signature:
  inputs: '[{"type": "string"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-27 15:29:27.653734'
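Per the signature above, the logged model takes strings as input, so it can be served through the generic pyfunc flavor to produce embeddings. A minimal sketch, under the same tracking-URI assumption as the previous sections, with <RUN_ID> as a placeholder:

import mlflow.pyfunc

# Load the embedding model through the pyfunc flavor; "<RUN_ID>" is a placeholder
embedder = mlflow.pyfunc.load_model("runs:/<RUN_ID>/model")

# One embedding vector should be returned per input sentence
embeddings = embedder.predict(["MLOps makes experiments reproducible."])
print(embeddings.shape)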
spaCy
The provided example demonstrates how to train a spaCy named entity recognition (NER) model and log it with TrackingClient.
import random
import spacy
from packaging.version import Version
from spacy.util import compounding, minibatch
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0 = Version(spacy.__version__) >= Version(
    "3.0.0"
)
if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
    # spaCy 3.x trains on Example objects instead of (text, annotations) pairs
    from spacy.training import Example

# training data
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

# create blank model and add ner to the pipeline
TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    nlp = spacy.blank("en")
    if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    params = {"n_iter": 100, "drop": 0.5}
    TrackingClient.log_params(params)
    nlp.begin_training()
    for itn in range(params["n_iter"]):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=params["drop"], losses=losses)
            else:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=params["drop"],  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)
        TrackingClient.log_metrics(losses)
    # Log the spaCy model using mlflow
    TrackingClient.spacy.log_model(spacy_model=nlp, artifact_path="model")
The generated MLmodel YAML file includes crucial information about the trained spaCy NER model and its version.
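As with the other flavors, the logged pipeline can be loaded back for inference. A minimal sketch, assuming an MLflow-compatible tracking URI and with <RUN_ID> as a placeholder for the run created above:

import mlflow.spacy

# Restore the trained NER pipeline; "<RUN_ID>" is a placeholder
nlp = mlflow.spacy.load_model("runs:/<RUN_ID>/model")

# Run entity recognition with the loaded model
doc = nlp("I like London and Berlin.")
print([(ent.text, ent.label_) for ent in doc.ents])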