Examples
Examples of experiment tracking with popular ML libraries.
All the examples are also available on GitHub.
sklearn
The following example demonstrates how to train a Random Forest Regressor using sklearn
and TrackingClient.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from oip_tracking_client.tracking import TrackingClient
from sklearn.metrics import mean_squared_error, r2_score

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "experiment_name"
TrackingClient.set_experiment(experiment_name)

# Load the dataset and split it into training and testing sets.
db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Create and train the RandomForestRegressor model.
    rf = RandomForestRegressor(n_estimators=10, max_depth=6, max_features=3)
    rf.fit(X_train, y_train)
    # Make predictions on the test set
    y_pred = rf.predict(X_test)
    # Calculate and log Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, y_pred)
    TrackingClient.log_metric("mse", mse)
    # Calculate and log R-squared
    r2 = r2_score(y_test, y_pred)
    TrackingClient.log_metric("r2", r2)
    signature = TrackingClient.infer_signature(X_train, y_train)
    TrackingClient.sklearn.log_model(rf, "model", signature=signature)
Upon training the model with TrackingClient, the following MLmodel YAML file is generated; it stores essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.sklearn
    model_path: model.pkl
    predict_fn: predict
    python_version: 3.9.7
  sklearn:
    code: null
    pickled_model: model.pkl
    serialization_format: cloudpickle
    sklearn_version: 1.0.2
mlflow_version: 2.4.0
model_uuid: e456b488a05942b58f173cfec10cdc42
run_id: 2fcfd26d59ad41f7aad7e6f487c56dd3
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 10]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-24 09:52:24.567632'
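Because the MLmodel file records a standard MLflow sklearn flavor, the logged model can be loaded back for inference. The following is a minimal sketch, assuming the MLflow tracking URI already points at the same backend (for example, after TrackingClient.connect); <RUN_ID> is a placeholder for the run_id shown above.

import numpy as np
import mlflow.sklearn

# Assumption: the MLflow tracking URI points at the same server that
# TrackingClient.connect was called against; "<RUN_ID>" is a placeholder.
loaded_rf = mlflow.sklearn.load_model("runs:/<RUN_ID>/model")

# The logged signature expects float64 input of shape (-1, 10)
sample = np.random.rand(2, 10)
print(loaded_rf.predict(sample))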
xgboost
The following example demonstrates how to train an XGBoost model using the xgboost
library and TrackingClient.
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "experiment_name"
TrackingClient.set_experiment(experiment_name)

# Load the dataset and split it into training and testing sets.
db = load_diabetes()
X_train, X_test, y_train, y_test = train_test_split(db.data, db.target)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Create and train the XGBoost model.
    params = {
        "n_estimators": 100,
        "max_depth": 6,
        "learning_rate": 0.1,
        "objective": "reg:squarederror",
    }
    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(X_train, y_train)
    signature = TrackingClient.infer_signature(X_train, y_train)
    TrackingClient.xgboost.log_model(xgb_model, "model", signature=signature)
After training the XGBoost model with TrackingClient, the following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    data: model.xgb
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.xgboost
    python_version: 3.9.7
  xgboost:
    code: null
    data: model.xgb
    model_class: xgboost.sklearn.XGBRegressor
    model_format: xgb
    xgb_version: 1.6.2
mlflow_version: 2.4.0
model_uuid: 1b41fa0786e34aedb2db835a43a71a05
run_id: 56159271f40f4351976391ff0bdadd77
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 10]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-24 10:24:57.517327'
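The python_function entry in the flavors section means the same artifact can also be loaded in a framework-agnostic way. A minimal sketch, under the same assumptions as above (MLflow-compatible tracking URI, <RUN_ID> as a placeholder):

import numpy as np
import mlflow.pyfunc

# Load the model through the generic pyfunc flavor listed in the MLmodel file;
# "<RUN_ID>" is a placeholder for the run_id above.
model = mlflow.pyfunc.load_model("runs:/<RUN_ID>/model")

# Input must match the logged signature: float64, shape (-1, 10)
sample = np.random.rand(3, 10)
print(model.predict(sample))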
statsmodels
The following example demonstrates how to train an ordinary least squares (OLS)
regression model using statsmodels and TrackingClient.
import numpy as np
import statsmodels.api as sm
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "experiment_name"
TrackingClient.set_experiment(experiment_name)

# Generate sample data with shape (-1, 10) for 10 features and a single output
np.random.seed(42)
num_samples = 100
num_features = 10
X = np.random.rand(num_samples, num_features)

# Generating the response variable as a linear combination of the features with some noise
true_coefficients = np.random.rand(num_features)
noise = np.random.normal(loc=0, scale=0.1, size=num_samples)
y = np.dot(X, true_coefficients) + noise

# Add a constant term to the independent variable (intercept)
X = sm.add_constant(X)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Create a linear regression model
    model = sm.OLS(y, X)
    # Fit the model to the data
    model_fit = model.fit()
    # Log the model using mlflow.statsmodels.log_model
    signature = TrackingClient.infer_signature(X, y)
    TrackingClient.statsmodels.log_model(model_fit, "model", signature=signature)
After training the OLS model with TrackingClient, the following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    data: model.statsmodels
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.statsmodels
    python_version: 3.9.7
  statsmodels:
    code: null
    data: model.statsmodels
    statsmodels_version: 0.13.2
mlflow_version: 2.4.0
model_uuid: 2e70fe54dc224704ac0870af09487bbc
run_id: 3d0b08ff27314c759400580083e98c4f
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 11]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-24 10:22:09.899267'
keras
Keras is a high-level neural networks API, written in Python, that allows for easy and fast prototyping and experimentation. You can use Keras in conjunction with TrackingClient to track and manage experiments for model training and inference.
import numpy as np
from tensorflow import keras
from tensorflow.keras.datasets import reuters
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

max_words = 50
batch_size = 32
epochs = 4

# Loading data
(x_train, y_train), (x_test, y_test) = reuters.load_data(
    num_words=max_words, test_split=0.2
)
num_classes = np.max(y_train) + 1

# Vectorizing sequence data
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode="binary")
x_test = tokenizer.sequences_to_matrix(x_test, mode="binary")

# Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    TrackingClient.tensorflow.autolog()
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Building model
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation("softmax"))
    model.compile(
        loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_split=0.1,
    )
    # Log the model using the Keras flavor
    signature = TrackingClient.infer_signature(x_train, y_train)
    TrackingClient.keras.log_model(model, "model", signature=signature)
After running the above code, TrackingClient will automatically log all relevant metrics, parameters, and artifacts (such as the trained model) during the training process.
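Because TrackingClient is MLflow-based (see the mlflow_version field in the MLmodel file below), the autologged values can also be read back programmatically. A minimal sketch, assuming the standard MLflow client can reach the same tracking server and with <RUN_ID> as a placeholder:

from mlflow.tracking import MlflowClient

# Assumption: the MLflow tracking URI points at the same server used by
# TrackingClient; "<RUN_ID>" is a placeholder for the run created above.
client = MlflowClient()
run = client.get_run("<RUN_ID>")

print(run.data.params)   # hyperparameters captured by autologging
print(run.data.metrics)  # latest values of the autologged metrics (loss, accuracy, ...)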
After training the Keras model with TrackingClient, the following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    data: data
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.tensorflow
    python_version: 3.9.7
  tensorflow:
    code: null
    data: data
    keras_version: 2.8.0
    model_type: keras
    save_format: tf
mlflow_version: 2.4.0
model_uuid: 82aa11563c54406bb1bafc76feafac5a
run_id: 56ccd94b83fa4d83865f0b8cb3644131
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 50]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float32", "shape": [-1, 46]}}]'
utc_time_created: '2023-11-24 10:42:44.701726'
Transformers
This section demonstrates how to perform sentiment analysis using the transformers library and log the model with TrackingClient. We'll use a pre-trained BERT model for sentiment analysis.
import torch
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
sentiment_analysis = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Define a sample movie review
review = "I really enjoyed watching this movie."

# Tokenize the review and convert to tensor
inputs = tokenizer(
    review, return_tensors="pt", padding=True, truncation=True, max_length=128
)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Start MLflow run
TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Log parameters
    TrackingClient.log_param("model_name", model_name)
    # Perform sentiment analysis
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # Get the predicted label
    predicted_label = torch.argmax(outputs.logits).item()
    sentiment = "positive" if predicted_label == 1 else "negative"
    TrackingClient.transformers.log_model(sentiment_analysis, "model")
The following MLmodel YAML file is generated:
artifact_path: model
flavors:
  python_function:
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.transformers
    model_binary: model
    python_version: 3.9.7
  transformers:
    code: null
    components:
    - tokenizer
    framework: pt
    instance_type: TextClassificationPipeline
    model_binary: model
    pipeline_model_type: BertForSequenceClassification
    source_model_name: bert-base-uncased
    task: sentiment-analysis
    tokenizer_type: BertTokenizer
    transformers_version: 4.29.2
mlflow_version: 2.4.0
model_uuid: 07915c2dafdb4373bc76ce3a2425888e
run_id: 6914f9db67424052bee8e139acf74b70
signature:
  inputs: '[{"type": "string"}]'
  outputs: '[{"type": "string", "name": "label"}, {"type": "double", "name": "score"}]'
utc_time_created: '2023-11-24 09:13:36.883081'
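The logged pipeline can be loaded back with the transformers flavor and used directly. A minimal sketch, under the same assumptions as the previous sections (MLflow-compatible tracking URI, <RUN_ID> as a placeholder):

import mlflow.transformers

# Load the sentiment-analysis pipeline logged above; "<RUN_ID>" is a placeholder
loaded_pipeline = mlflow.transformers.load_model("runs:/<RUN_ID>/model")

# Per the logged signature, the pipeline takes strings and returns a label and a score
print(loaded_pipeline("I really enjoyed watching this movie."))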
PyTorch
This example showcases the training of a Convolutional Neural Network (CNN) using PyTorch for image classification on the FashionMNIST dataset. The code includes integration with TrackingClient for efficient experiment tracking.
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from ignite.engine import Events, create_supervised_trainer, create_supervised_evaluator
from ignite.metrics import Accuracy, Loss, ConfusionMatrix, RunningAverage
from ignite.handlers import EarlyStopping
from ignite.contrib.handlers import ProgressBar
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

# Transform to normalize the data
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Download and load the training data
trainset = datasets.FashionMNIST(
    "./data", download=True, train=True, transform=transform
)
train_loader = DataLoader(trainset, batch_size=64, shuffle=True)

# Download and load the test data
validationset = datasets.FashionMNIST(
    "./data", download=True, train=False, transform=transform
)
val_loader = DataLoader(validationset, batch_size=64, shuffle=True)

# CNN model class
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.convlayer1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.convlayer2 = nn.Sequential(
            nn.Conv2d(32, 64, 3), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2)
        )
        self.fc1 = nn.Linear(64 * 6 * 6, 600)
        self.drop = nn.Dropout2d(0.25)
        self.fc2 = nn.Linear(600, 120)
        self.fc3 = nn.Linear(120, 10)

    def forward(self, x):
        x = self.convlayer1(x)
        x = self.convlayer2(x)
        x = x.view(-1, 64 * 6 * 6)
        x = self.fc1(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

# Creating model, optimizer, and loss
model = CNN()
# Moving model to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()

# Defining the number of epochs
epochs = 3

# Creating trainer and evaluator
trainer = create_supervised_trainer(model, optimizer, criterion, device=device)
metrics = {
    "accuracy": Accuracy(),
    "nll": Loss(criterion),
    "cm": ConfusionMatrix(num_classes=10),
}
train_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
val_evaluator = create_supervised_evaluator(model, metrics=metrics, device=device)
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")

# Early stopping based on validation loss
def score_function(engine):
    val_loss = engine.state.metrics["nll"]
    return -val_loss

handler = EarlyStopping(patience=10, score_function=score_function, trainer=trainer)
val_evaluator.add_event_handler(Events.COMPLETED, handler)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    pbar = ProgressBar(persist=True, bar_format="")
    pbar.attach(trainer, ["loss"])
    trainer.run(train_loader, max_epochs=epochs)
    # Infer the signature from a training batch and log the model
    x_train_batch, y_train_batch = next(iter(train_loader))
    x_train_np = x_train_batch.numpy()
    y_train_np = y_train_batch.numpy()
    signature = TrackingClient.infer_signature(x_train_np, y_train_np)
    TrackingClient.pytorch.log_model(model, "model", signature=signature)
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    data: data
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.pytorch
    pickle_module_name: mlflow.pytorch.pickle_module
    python_version: 3.8.10
  pytorch:
    code: null
    model_data: data
    pytorch_version: 1.13.1+cu117
mlflow_version: 2.6.0
model_uuid: dd70b8ae29354d9091a1a94ffdb9b2a3
run_id: ee063c63cb5b49b89953cefff6a203b9
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float32", "shape": [-1, 1, 28, 28]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]'
  params: null
utc_time_created: '2023-11-27 14:00:32.746631'
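The logged network can be restored as a regular torch.nn.Module through the pytorch flavor. A minimal sketch, assuming an MLflow-compatible tracking URI and a CPU environment; <RUN_ID> is a placeholder:

import torch
import mlflow.pytorch

# Restore the trained CNN; "<RUN_ID>" is a placeholder for the run shown above
loaded_model = mlflow.pytorch.load_model("runs:/<RUN_ID>/model")
loaded_model.eval()

# The logged signature expects float32 tensors of shape (-1, 1, 28, 28)
dummy = torch.randn(1, 1, 28, 28)
with torch.no_grad():
    log_probs = loaded_model(dummy)
print(log_probs.argmax(dim=1))  # predicted FashionMNIST class index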
fastai
This example demonstrates the training of a convolutional neural network (CNN) using the fastai library and integrates it with TrackingClient for experiment tracking.
from fastai.vision.all import (
    URLs,
    untar_data,
    ImageDataLoaders,
    cnn_learner,
    resnet18,
    accuracy,
)
import numpy as np
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

path = untar_data(URLs.CIFAR)
dls = ImageDataLoaders.from_folder(path, train="train", valid="test")

# Get a batch of validation data
inputs, targets = dls.valid.one_batch()
# Extract the shape of the validation data
x_valid = inputs.numpy()
_, ch, w, h = x_valid.shape
x_valid = np.zeros(
    shape=(1, w, h, ch), dtype=np.uint8
)  # Note: input could be any NumPy array supported by PIL.Image.fromarray

learn = cnn_learner(dls, resnet18, metrics=accuracy)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    # Log the parameters
    TrackingClient.log_params({"epochs": 2, "lr": 1e-3})
    # Train the model
    learn.fit_one_cycle(1, lr_max=1e-3)
    # Log the trained model
    y_valid = np.zeros(
        shape=(1, 10), dtype=np.float32
    )  # Model output is a probability vector of size 10
    signature = TrackingClient.infer_signature(x_valid, y_valid)
    TrackingClient.fastai.log_model(learn, "model", signature=signature)
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  fastai:
    code: null
    data: model.fastai
    fastai_version: 2.7.12
  python_function:
    data: model.fastai
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.fastai
    python_version: 3.8.10
mlflow_version: 2.6.0
model_uuid: 8b83ba5aebab4015a88abc0730103367
run_id: dbc2db9f3b85456e88dc9a58be5943bb
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float32", "shape": [-1, 3, 32, 32]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]'
  params: null
utc_time_created: '2023-11-27 14:38:42.470057'
TensorFlow
This example illustrates training a simple neural network using TensorFlow and integrates it with TrackingClient for efficient experiment tracking.
import tensorflow as tf
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10),
    ]
)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=loss_fn, metrics=["accuracy"])

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    model.fit(x_train, y_train, epochs=1)
    signature = TrackingClient.infer_signature(x_train, y_train)
    TrackingClient.tensorflow.log_model(model, "model", signature=signature)
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    data: data
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.tensorflow
    python_version: 3.9.7
  tensorflow:
    code: null
    data: data
    keras_version: 2.8.0
    model_type: keras
    save_format: tf
mlflow_version: 2.4.0
model_uuid: 1c5b72f9ae684923a175771562531706
run_id: 80c2a7e836c645c6b9189658732beaed
signature:
  inputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1, 28, 28]}}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "uint8", "shape": [-1]}}]'
utc_time_created: '2023-11-27 15:14:32.143630'
Sentence Transformers
This example demonstrates how to use the Sentence Transformers library for sentence embeddings and integrates it with TrackingClient for experiment tracking.
from sentence_transformers import SentenceTransformer
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    # The signature is inferred and logged automatically
    TrackingClient.sentence_transformers.log_model(model, "model")
The MLmodel YAML file below is generated upon running the example. It includes essential information about the trained model and its version.
artifact_path: model
flavors:
  python_function:
    data: model.sentence_transformer
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.sentence_transformers
    python_version: 3.9.7
  sentence_transformers:
    code: null
    sentence_transformers_version: 2.2.2
mlflow_version: 2.4.0
model_uuid: e0eb4cfef2fb496d8c1116a5ac97c067
run_id: 387d039bae5d4dacad46d3d3a6d5baae
signature:
  inputs: '[{"type": "string"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
utc_time_created: '2023-11-27 15:29:27.653734'
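Per the signature above, the logged model takes strings as input, so it can be served through the generic pyfunc flavor to produce embeddings. A minimal sketch, under the same tracking-URI assumption as the previous sections, with <RUN_ID> as a placeholder:

import mlflow.pyfunc

# Load the embedding model through the pyfunc flavor; "<RUN_ID>" is a placeholder
embedder = mlflow.pyfunc.load_model("runs:/<RUN_ID>/model")

# One embedding vector should be returned per input sentence
embeddings = embedder.predict(["MLOps makes experiments reproducible."])
print(embeddings.shape)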
spaCy
The provided example demonstrates how to train a spaCy named entity recognition (NER) model and log it with TrackingClient.
import random
import spacy
from packaging.version import Version
from spacy.util import compounding, minibatch
from oip_tracking_client.tracking import TrackingClient

# set up TrackingClient
api_host = "<API_HOST>"
api_key = "<API_KEY>"
workspace_name = "WORKSPACE_NAME"
TrackingClient.connect(api_host, api_key, workspace_name)

# set the experiment
experiment_name = "expr001"
TrackingClient.set_experiment(experiment_name)

IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0 = Version(spacy.__version__) >= Version(
    "3.0.0"
)
if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
    # spaCy 3.x trains on Example objects instead of (text, annotations) pairs
    from spacy.training import Example

# training data
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]

# create blank model and add ner to the pipeline
TrackingClient.enable_system_metrics_logging()
with TrackingClient.start_run():
    # Set the run name
    TrackingClient.set_run_name("YOUR_RUN_NAME")
    nlp = spacy.blank("en")
    if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
        ner = nlp.add_pipe("ner", last=True)
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    params = {"n_iter": 100, "drop": 0.5}
    TrackingClient.log_params(params)
    nlp.begin_training()
    for itn in range(params["n_iter"]):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            if IS_SPACY_VERSION_NEWER_THAN_OR_EQUAL_TO_3_0_0:
                examples = [
                    Example.from_dict(nlp.make_doc(text), annotations)
                    for text, annotations in batch
                ]
                nlp.update(examples, drop=params["drop"], losses=losses)
            else:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=params["drop"],  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)
        TrackingClient.log_metrics(losses)
    # Log the spaCy model using mlflow
    TrackingClient.spacy.log_model(spacy_model=nlp, artifact_path="model")
The generated MLmodel YAML file includes crucial information about the trained spaCy NER model and its version.
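As with the other flavors, the logged pipeline can be loaded back for inference. A minimal sketch, assuming an MLflow-compatible tracking URI and with <RUN_ID> as a placeholder for the run created above:

import mlflow.spacy

# Restore the trained NER pipeline; "<RUN_ID>" is a placeholder
nlp = mlflow.spacy.load_model("runs:/<RUN_ID>/model")

# Run entity recognition with the loaded model
doc = nlp("I like London and Berlin.")
print([(ent.text, ent.label_) for ent in doc.ents])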