Chocolate Preference Identifier (Quantum SVC)¶

A quantum variant of chocolate category prediction:

  • ZZFeatureMap (reps=2, linear entanglement),
  • FidelityQuantumKernel pÅ™es AerSimulator (statevector),
  • QSVC z qiskit_machine_learning.

Dataset is here: "Chocolate Sales Dataset 2023 - 2024"

Link: kaggle

In [1]:
import pandas as pd
import numpy as np
import dill
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC
from qiskit_algorithms.utils import algorithm_globals
from qiskit_algorithms.state_fidelities import ComputeUncompute
from qiskit_aer import AerSimulator
from qiskit.primitives import BackendSamplerV2
from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
import warnings
import time
warnings.filterwarnings('ignore')

algorithm_globals.random_seed = 42
SAMPLE_SIZE = 250

sales     = pd.read_csv('sales.csv')
customers = pd.read_csv('customers.csv')
products  = pd.read_csv('products.csv')
stores    = pd.read_csv('stores.csv')

df = (sales
      .merge(customers, on='customer_id', how='left')
      .merge(products,  on='product_id',  how='left')
      .merge(stores,    on='store_id',    how='left'))

df['age']            = df['age'].fillna(df['age'].median())
df['gender']         = df['gender'].fillna('Unknown')
df['loyalty_member'] = df['loyalty_member'].fillna(0).astype(int)
df = df.dropna(subset=['category', 'brand', 'cocoa_percent', 'weight_g'])

le_gender     = LabelEncoder()
le_country    = LabelEncoder()
le_store_type = LabelEncoder()
le_category   = LabelEncoder()

df['gender_encoded']     = le_gender.fit_transform(df['gender'].astype(str))
df['country_encoded']    = le_country.fit_transform(df['country'].astype(str))
df['store_type_encoded'] = le_store_type.fit_transform(df['store_type'].astype(str))
df['category_id']        = le_category.fit_transform(df['category'].astype(str))

df['avg_spend']   = df['revenue'] / df['quantity']
df['is_premium']  = (df['unit_price'] >= df['unit_price'].median()).astype(int)
df['is_discount'] = (df['discount'] > 0).astype(int)

demographic_features = ['age', 'gender_encoded', 'country_encoded',
                        'loyalty_member', 'store_type_encoded']
behavior_features    = ['quantity', 'unit_price', 'discount',
                        'avg_spend', 'is_premium', 'is_discount']

df_sample = df.head(SAMPLE_SIZE).copy()
y = df_sample['category_id'].values

backend = AerSimulator(
    method='statevector',
    device='CPU',
    max_parallel_threads=4,
    max_parallel_experiments=1,
    precision='double',
    max_memory_mb=8192,
    fusion_enable=True,
    fusion_verbose=False,
    fusion_max_qubit=5,
    fusion_threshold=14)

pass_manager = generate_preset_pass_manager(
    backend=backend,
    optimization_level=2,
    seed_transpiler=42)

sampler = BackendSamplerV2(
    backend=backend,
    options={
        "default_shots": 1024,
        "seed_simulator": 42
    })

X_demo = df_sample[demographic_features].values

scaler_demo = StandardScaler()
X_demo_scaled = scaler_demo.fit_transform(X_demo)

X_train, X_test, y_train, y_test = train_test_split(
    X_demo_scaled, y, test_size=0.2, random_state=42
)

num_features = len(demographic_features)
feature_map = ZZFeatureMap(
    feature_dimension=num_features,
    reps=2,
    entanglement='linear')

print(f"  - Features: {num_features}")
print(f"  - Qubits: {feature_map.num_qubits}")
print(f"  - Gates: {feature_map.count_ops()}")

fidelity = ComputeUncompute(
    sampler=sampler,
    transpiler=pass_manager)

quantum_kernel = FidelityQuantumKernel(
    feature_map=feature_map,
    fidelity=fidelity)

qsvc_demo = QSVC(quantum_kernel=quantum_kernel)

start_time = time.time()

qsvc_demo.fit(X_demo_scaled, y)

train_time = time.time() - start_time
train_acc = qsvc_demo.score(X_demo_scaled, y)

print(f"  - Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f"  - Training time: {train_time:.2f}s")
print(f"  - Support vectors: {qsvc_demo.support_vectors_.shape[0]}")

pkg_demo = {
    'model': qsvc_demo,
    'scaler': scaler_demo,
    'features': demographic_features,
    'le_category': le_category,
    'le_gender': le_gender,
    'le_country': le_country,
    'le_store_type': le_store_type,
    'dataset': df_sample[['customer_id', 'age', 'gender', 'country',
                          'loyalty_member', 'store_type', 'category']],
    'type': 'demographic_quantum_aer_transpiled',
    'feature_map': feature_map,
    'backend_name': backend.name,
    'optimization_level': 2,
    'training_time': train_time,
    'accuracy': train_acc
}

with open('chocolate_identifier_demographic_qsvc.pkl', 'wb') as f:
    dill.dump(pkg_demo, f)

X_beh = df_sample[behavior_features].values

scaler_beh = StandardScaler()
X_beh_scaled = scaler_beh.fit_transform(X_beh)

num_features_beh = len(behavior_features)
feature_map_beh = ZZFeatureMap(
    feature_dimension=num_features_beh,
    reps=2,
    entanglement='linear')

print(f"  - Features: {num_features_beh}")
print(f"  - Qubits: {feature_map_beh.num_qubits}")
print(f"  - Gates: {feature_map_beh.count_ops()}")

backend_beh = AerSimulator(
    method='statevector',
    device='CPU',
    max_parallel_threads=4,
    fusion_enable=True)

pass_manager_beh = generate_preset_pass_manager(
    backend=backend_beh,
    optimization_level=3,
    seed_transpiler=42)

sampler_beh = BackendSamplerV2(
    backend=backend_beh,
    options={
        "default_shots": 2048,
        "seed_simulator": 42
    })

fidelity_beh = ComputeUncompute(
    sampler=sampler_beh,
    transpiler=pass_manager_beh)

quantum_kernel_beh = FidelityQuantumKernel(
    feature_map=feature_map_beh,
    fidelity=fidelity_beh)

qsvc_beh = QSVC(quantum_kernel=quantum_kernel_beh)
start_time_beh = time.time()
qsvc_beh.fit(X_beh_scaled, y)

train_time_beh = time.time() - start_time_beh
train_acc_beh = qsvc_beh.score(X_beh_scaled, y)

print(f"  - Accuracy: {train_acc_beh:.4f} ({train_acc_beh*100:.2f}%)")
print(f"  - Training time: {train_time_beh:.2f}s")
print(f"  - Support vectors: {qsvc_beh.support_vectors_.shape[0]}")

pkg_beh = {
    'model': qsvc_beh,
    'scaler': scaler_beh,
    'features': behavior_features,
    'le_category': le_category,
    'dataset': df_sample[['customer_id'] + behavior_features + ['category']],
    'type': 'behavior_quantum_aer_transpiled',
    'feature_map': feature_map_beh,
    'backend_name': backend_beh.name,
    'optimization_level': 3,
    'training_time': train_time_beh,
    'accuracy': train_acc_beh
}

with open('chocolate_identifier_behavior_qsvc.pkl', 'wb') as f:
    dill.dump(pkg_beh, f)
  - Features: 5
  - Qubits: 5
  - Gates: OrderedDict({'ZZFeatureMap': 1})
  - Accuracy: 0.6880 (68.80%)
  - Training time: 187.33s
  - Support vectors: 0
  - Features: 6
  - Qubits: 6
  - Gates: OrderedDict({'ZZFeatureMap': 1})
  - Accuracy: 0.7480 (74.80%)
  - Training time: 336.65s
  - Support vectors: 0
In [2]:
def predict_top_k_svm(model, scaler, X, le_category, k=5):
    X_scaled = scaler.transform(X)
    decision_scores = model.decision_function(X_scaled)

    if decision_scores.ndim == 1:
        decision_scores = decision_scores.reshape(-1, 1)

    results = []
    for i in range(len(X)):
        scores = decision_scores[i]
        n = len(scores)
        k_eff = min(k, n)
        top_k_indices = np.argsort(scores)[::-1][:k_eff]
        top_k_scores = scores[top_k_indices]

        exp_scores = np.exp(top_k_scores - np.max(top_k_scores))
        confidences = 100 * exp_scores / exp_scores.sum()

        category_names = le_category.inverse_transform(top_k_indices)
        results.append(list(zip(category_names, confidences)))

    return results
In [3]:
print("\nTEST 1: 35y woman, France, loyalty yes, Mall")
g_enc = le_gender.transform(['Female'])[0]
c_enc = le_country.transform(['France'])[0]
s_enc = le_store_type.transform(['Mall'])[0]

X_test = np.array([[35, g_enc, c_enc, 1, s_enc]])
top_k = predict_top_k_svm(qsvc_demo, scaler_demo, X_test, le_category, k=5)

print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
    print(f"    {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 1: 35y woman, France, loyalty yes, Mall

  Top 5:
    1. Praline    (confidence: 70.6%)
    2. Milk       (confidence: 19.3%)
    3. Truffle    (confidence: 6.8%)
    4. Dark       (confidence: 2.5%)
    5. White      (confidence: 0.8%)
In [4]:
print("\nTEST 2: 22y man, USA, loyalty no, Online")
g_enc = le_gender.transform(['Male'])[0]
c_enc = le_country.transform(['USA'])[0]
s_enc = le_store_type.transform(['Online'])[0]

X_test = np.array([[22, g_enc, c_enc, 0, s_enc]])
top_k = predict_top_k_svm(qsvc_demo, scaler_demo, X_test, le_category, k=5)

print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
    print(f"    {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 2: 22y man, USA, loyalty no, Online

  Top 5:
    1. Praline    (confidence: 67.0%)
    2. Truffle    (confidence: 23.1%)
    3. Milk       (confidence: 7.2%)
    4. Dark       (confidence: 2.0%)
    5. White      (confidence: 0.7%)
In [5]:
print("\nTEST 3: Identification by behavior (3 items @ 18.50, discount 0.10)")
qty, price, disc = 3, 18.50, 0.10
avg_spend = price * (1 - disc)
is_prem = 1 if price >= 12 else 0
is_disc = 1 if disc > 0 else 0

X_test = np.array([[qty, price, disc, avg_spend, is_prem, is_disc]])
top_k = predict_top_k_svm(qsvc_beh, scaler_beh, X_test, le_category, k=5)

print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
    print(f"    {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 3: Identification by behavior (3 items @ 18.50, discount 0.10)

  Top 5:
    1. Praline    (confidence: 67.1%)
    2. Dark       (confidence: 22.6%)
    3. Milk       (confidence: 7.0%)
    4. White      (confidence: 2.4%)
    5. Truffle    (confidence: 0.9%)
In [6]:
print("\nTEST 4: 60y woman, UK, loyalty yes, Retail")
g_enc = le_gender.transform(['Female'])[0]
c_enc = le_country.transform(['UK'])[0]
s_enc = le_store_type.transform(['Retail'])[0]

X_test = np.array([[60, g_enc, c_enc, 1, s_enc]])
top_k = predict_top_k_svm(qsvc_demo, scaler_demo, X_test, le_category, k=5)

print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
    print(f"    {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 4: 60y woman, UK, loyalty yes, Retail

  Top 5:
    1. Praline    (confidence: 67.0%)
    2. Dark       (confidence: 22.6%)
    3. White      (confidence: 7.5%)
    4. Milk       (confidence: 2.2%)
    5. Truffle    (confidence: 0.7%)
In [7]:
#You can use this cell on its own for prediction

import dill
import numpy as np
import pandas as pd


class ChocolatePredictor:

    def __init__(self, model_path):
        self.model_path = model_path
        self._load_model()

    def _load_model(self):
        with open(self.model_path, 'rb') as f:
            pkg = dill.load(f)

        self.model = pkg['model']
        self.scaler = pkg['scaler']
        self.le_category = pkg['le_category']
        self.le_gender = pkg.get('le_gender')
        self.le_country = pkg.get('le_country')
        self.le_store_type = pkg.get('le_store_type')
        self.features = pkg['features']
        self.dataset = pkg.get('dataset')

        self.train_accuracy = pkg.get('accuracy')
        self.training_time = pkg.get('training_time')

        if self.train_accuracy:
            print(f"  - Train accuracy: {self.train_accuracy*100:.2f}%")

    def _prepare_features(self, customer_data):
        if isinstance(customer_data, dict):
            age      = customer_data['age']
            gender   = customer_data['gender']
            country  = customer_data['country']
            loyalty  = customer_data.get('loyalty_member', 0)
            store_t  = customer_data.get('store_type', 'Retail')

            g_enc = self.le_gender.transform([gender])[0]
            c_enc = self.le_country.transform([country])[0]
            s_enc = self.le_store_type.transform([store_t])[0]

            return np.array([[age, g_enc, c_enc, loyalty, s_enc]])

        elif isinstance(customer_data, pd.DataFrame):
            return customer_data[self.features].values

        else:
            return customer_data

    def predict_top_k(self, customer_data, k=5, show_scores=False):
        """
        Args:
            customer_data: dict, DataFrame or array
            k: number of predictions
            show_scores: print decision scores

        Returns:
            list: [(category_name, confidence), ...]
        """
        X = self._prepare_features(customer_data)
        X_scaled = self.scaler.transform(X)

        decision_scores = self.model.decision_function(X_scaled)
        if decision_scores.ndim == 1:
            decision_scores = decision_scores.reshape(-1, 1)

        if show_scores:
            print(f"Decision scores shape: {decision_scores.shape}")

        results = []
        for i in range(len(X)):
            scores = decision_scores[i]
            k_eff = min(k, len(scores))
            top_k_indices = np.argsort(scores)[::-1][:k_eff]
            top_k_scores = scores[top_k_indices]

            exp_scores = np.exp(top_k_scores - np.max(top_k_scores))
            confidences = 100 * exp_scores / exp_scores.sum()

            category_names = self.le_category.inverse_transform(top_k_indices)
            results.append(list(zip(category_names, confidences)))

        return results[0] if len(results) == 1 else results

    def predict_single(self, age, gender, country, loyalty_member=0,
                       store_type='Retail', k=5):
        """
        Args:
            age: age
            gender: 'Male' / 'Female'
            country: country
            loyalty_member: 0/1
            store_type: 'Retail' / 'Mall' / 'Airport' / 'Online'
            k: number of predictions
        """
        customer_data = {
            'age': age,
            'gender': gender,
            'country': country,
            'loyalty_member': loyalty_member,
            'store_type': store_type
        }
        return self.predict_top_k(customer_data, k=k)

    def info(self):
        print(f"Features: {self.features}")
        print(f"Total samples in dataset: {len(self.dataset) if self.dataset is not None else 'N/A'}")
        print(f"Train accuracy: {self.train_accuracy*100:.2f}%" if self.train_accuracy else "N/A")
        print(f"Training time: {self.training_time:.2f}s" if self.training_time else "N/A")
        if hasattr(self.model, 'support_'):
            print(f"Support vectors: {len(self.model.support_)}")

    def list_customers(self, n=10):
        if self.dataset is not None:
            print(f"\nFirst {n} customers in dataset:")
            cols = [c for c in ['customer_id', 'age', 'gender',
                                'country', 'category'] if c in self.dataset.columns]
            print(self.dataset[cols].head(n))
        else:
            print("Dataset not available in model package.")


predictor = ChocolatePredictor('chocolate_identifier_demographic_qsvc.pkl')
predictor.info()

print()
print("TEST1: woman 35y, France, loyalty yes, Mall")
result = predictor.predict_single(35, 'Female', 'France', 1, 'Mall', k=5)
for i, (name, conf) in enumerate(result, 1):
    print(f"{i}. {name:10s} - {conf:.1f}%")

print()
print("TEST2: man 22y, USA, loyalty no, Online")
result = predictor.predict_single(22, 'Male', 'USA', 0, 'Online', k=5)
for i, (name, conf) in enumerate(result, 1):
    print(f"{i}. {name:10s} - {conf:.1f}%")

print()
print("TEST3: Custom profile")
custom_customer = {
    'age': 60,
    'gender': 'Female',
    'country': 'UK',
    'loyalty_member': 1,
    'store_type': 'Retail'
}
result = predictor.predict_top_k(custom_customer, k=5)
for i, (name, conf) in enumerate(result, 1):
    print(f"{i}. {name:10s} - {conf:.1f}%")

predictor.list_customers(10)
  - Train accuracy: 68.80%
Features: ['age', 'gender_encoded', 'country_encoded', 'loyalty_member', 'store_type_encoded']
Total samples in dataset: 250
Train accuracy: 68.80%
Training time: 187.33s
Support vectors: 249

TEST1: woman 35y, France, loyalty yes, Mall
1. Praline    - 70.6%
2. Milk       - 19.3%
3. Truffle    - 6.8%
4. Dark       - 2.5%
5. White      - 0.8%

TEST2: man 22y, USA, loyalty no, Online
1. Praline    - 67.0%
2. Truffle    - 23.1%
3. Milk       - 7.2%
4. Dark       - 2.0%
5. White      - 0.7%

TEST3: Custom profile
1. Praline    - 67.0%
2. Dark       - 22.6%
3. White      - 7.5%
4. Milk       - 2.2%
5. Truffle    - 0.7%

First 10 customers in dataset:
  customer_id  age  gender    country category
0     C040749   44    Male         UK    White
1     C020161   63  Female  Australia  Praline
2     C048069   35    Male         UK     Milk
3     C047901   37  Female        USA  Praline
4     C033950   57  Female     Canada  Truffle
5     C008918   35    Male     Canada     Dark
6     C002897   55  Female     France     Dark
7     C038072   36  Female        USA     Dark
8     C003786   63  Female  Australia  Praline
9     C043148   26  Female    Germany     Milk
In [8]:
import matplotlib.pyplot as plt

# Kernel matrix = quantum fidelity |⟨φ(x_i)|φ(x_j)⟩|² for all parametres

print("kernel matrix (demographic)...")
K_demo = quantum_kernel.evaluate(x_vec=X_demo_scaled)
print(f"  Shape: {K_demo.shape}")

print("kernel matrix (behavior)...")
K_beh = quantum_kernel_beh.evaluate(x_vec=X_beh_scaled)
print(f"  Shape: {K_beh.shape}")

order = np.argsort(y)
K_demo_sorted = K_demo[order][:, order]
K_beh_sorted  = K_beh[order][:, order]

class_counts = np.bincount(y)
boundaries   = np.cumsum(class_counts)[:-1]
centers      = np.cumsum(class_counts) - class_counts / 2
cat_names    = le_category.inverse_transform(np.arange(len(class_counts)))

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

for ax, K, title in [
    (axes[0], K_demo_sorted, 'Quantum Kernel — Demographic'),
    (axes[1], K_beh_sorted,  'Quantum Kernel — Behavior'),
]:
    im = ax.imshow(K, cmap='viridis', aspect='equal', vmin=0, vmax=1)
    for b in boundaries:
        ax.axhline(b, color='red', linewidth=0.5, alpha=0.6)
        ax.axvline(b, color='red', linewidth=0.5, alpha=0.6)
    ax.set_title(f'{title}\n(by category, n={K.shape[0]})')
    ax.set_xticks(centers); ax.set_xticklabels(cat_names, rotation=45)
    ax.set_yticks(centers); ax.set_yticklabels(cat_names)
    plt.colorbar(im, ax=ax, fraction=0.046, label='fidelity')

plt.tight_layout()
plt.show()

print(f"\nDemographic kernel:  mean={K_demo.mean():.3f}  std={K_demo.std():.3f}  "
      f"diag mean={K_demo.diagonal().mean():.3f}")
print(f"Behavior kernel:     mean={K_beh.mean():.3f}  std={K_beh.std():.3f}  "
      f"diag mean={K_beh.diagonal().mean():.3f}")
kernel matrix (demographic)...
  Shape: (250, 250)
kernel matrix (behavior)...
  Shape: (250, 250)
No description has been provided for this image
Demographic kernel:  mean=0.067  std=0.106  diag mean=1.004
Behavior kernel:     mean=0.036  std=0.095  diag mean=1.002