In [1]:
import pandas as pd
import numpy as np
import dill
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC
from qiskit_algorithms.utils import algorithm_globals
from qiskit_algorithms.state_fidelities import ComputeUncompute
from qiskit_aer import AerSimulator
from qiskit.primitives import BackendSamplerV2
from qiskit.transpiler.preset_passmanagers import generate_preset_pass_manager
import warnings
import time
warnings.filterwarnings('ignore')
algorithm_globals.random_seed = 42
SAMPLE_SIZE = 250
sales = pd.read_csv('sales.csv')
customers = pd.read_csv('customers.csv')
products = pd.read_csv('products.csv')
stores = pd.read_csv('stores.csv')
df = (sales
.merge(customers, on='customer_id', how='left')
.merge(products, on='product_id', how='left')
.merge(stores, on='store_id', how='left'))
df['age'] = df['age'].fillna(df['age'].median())
df['gender'] = df['gender'].fillna('Unknown')
df['loyalty_member'] = df['loyalty_member'].fillna(0).astype(int)
df = df.dropna(subset=['category', 'brand', 'cocoa_percent', 'weight_g'])
le_gender = LabelEncoder()
le_country = LabelEncoder()
le_store_type = LabelEncoder()
le_category = LabelEncoder()
df['gender_encoded'] = le_gender.fit_transform(df['gender'].astype(str))
df['country_encoded'] = le_country.fit_transform(df['country'].astype(str))
df['store_type_encoded'] = le_store_type.fit_transform(df['store_type'].astype(str))
df['category_id'] = le_category.fit_transform(df['category'].astype(str))
df['avg_spend'] = df['revenue'] / df['quantity']
df['is_premium'] = (df['unit_price'] >= df['unit_price'].median()).astype(int)
df['is_discount'] = (df['discount'] > 0).astype(int)
demographic_features = ['age', 'gender_encoded', 'country_encoded',
'loyalty_member', 'store_type_encoded']
behavior_features = ['quantity', 'unit_price', 'discount',
'avg_spend', 'is_premium', 'is_discount']
df_sample = df.head(SAMPLE_SIZE).copy()
y = df_sample['category_id'].values
backend = AerSimulator(
method='statevector',
device='CPU',
max_parallel_threads=4,
max_parallel_experiments=1,
precision='double',
max_memory_mb=8192,
fusion_enable=True,
fusion_verbose=False,
fusion_max_qubit=5,
fusion_threshold=14)
pass_manager = generate_preset_pass_manager(
backend=backend,
optimization_level=2,
seed_transpiler=42)
sampler = BackendSamplerV2(
backend=backend,
options={
"default_shots": 1024,
"seed_simulator": 42
})
X_demo = df_sample[demographic_features].values
scaler_demo = StandardScaler()
X_demo_scaled = scaler_demo.fit_transform(X_demo)
X_train, X_test, y_train, y_test = train_test_split(
X_demo_scaled, y, test_size=0.2, random_state=42
)
num_features = len(demographic_features)
feature_map = ZZFeatureMap(
feature_dimension=num_features,
reps=2,
entanglement='linear')
print(f" - Features: {num_features}")
print(f" - Qubits: {feature_map.num_qubits}")
print(f" - Gates: {feature_map.count_ops()}")
fidelity = ComputeUncompute(
sampler=sampler,
transpiler=pass_manager)
quantum_kernel = FidelityQuantumKernel(
feature_map=feature_map,
fidelity=fidelity)
qsvc_demo = QSVC(quantum_kernel=quantum_kernel)
start_time = time.time()
qsvc_demo.fit(X_demo_scaled, y)
train_time = time.time() - start_time
train_acc = qsvc_demo.score(X_demo_scaled, y)
print(f" - Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
print(f" - Training time: {train_time:.2f}s")
print(f" - Support vectors: {qsvc_demo.support_vectors_.shape[0]}")
pkg_demo = {
'model': qsvc_demo,
'scaler': scaler_demo,
'features': demographic_features,
'le_category': le_category,
'le_gender': le_gender,
'le_country': le_country,
'le_store_type': le_store_type,
'dataset': df_sample[['customer_id', 'age', 'gender', 'country',
'loyalty_member', 'store_type', 'category']],
'type': 'demographic_quantum_aer_transpiled',
'feature_map': feature_map,
'backend_name': backend.name,
'optimization_level': 2,
'training_time': train_time,
'accuracy': train_acc
}
with open('chocolate_identifier_demographic_qsvc.pkl', 'wb') as f:
dill.dump(pkg_demo, f)
X_beh = df_sample[behavior_features].values
scaler_beh = StandardScaler()
X_beh_scaled = scaler_beh.fit_transform(X_beh)
num_features_beh = len(behavior_features)
feature_map_beh = ZZFeatureMap(
feature_dimension=num_features_beh,
reps=2,
entanglement='linear')
print(f" - Features: {num_features_beh}")
print(f" - Qubits: {feature_map_beh.num_qubits}")
print(f" - Gates: {feature_map_beh.count_ops()}")
backend_beh = AerSimulator(
method='statevector',
device='CPU',
max_parallel_threads=4,
fusion_enable=True)
pass_manager_beh = generate_preset_pass_manager(
backend=backend_beh,
optimization_level=3,
seed_transpiler=42)
sampler_beh = BackendSamplerV2(
backend=backend_beh,
options={
"default_shots": 2048,
"seed_simulator": 42
})
fidelity_beh = ComputeUncompute(
sampler=sampler_beh,
transpiler=pass_manager_beh)
quantum_kernel_beh = FidelityQuantumKernel(
feature_map=feature_map_beh,
fidelity=fidelity_beh)
qsvc_beh = QSVC(quantum_kernel=quantum_kernel_beh)
start_time_beh = time.time()
qsvc_beh.fit(X_beh_scaled, y)
train_time_beh = time.time() - start_time_beh
train_acc_beh = qsvc_beh.score(X_beh_scaled, y)
print(f" - Accuracy: {train_acc_beh:.4f} ({train_acc_beh*100:.2f}%)")
print(f" - Training time: {train_time_beh:.2f}s")
print(f" - Support vectors: {qsvc_beh.support_vectors_.shape[0]}")
pkg_beh = {
'model': qsvc_beh,
'scaler': scaler_beh,
'features': behavior_features,
'le_category': le_category,
'dataset': df_sample[['customer_id'] + behavior_features + ['category']],
'type': 'behavior_quantum_aer_transpiled',
'feature_map': feature_map_beh,
'backend_name': backend_beh.name,
'optimization_level': 3,
'training_time': train_time_beh,
'accuracy': train_acc_beh
}
with open('chocolate_identifier_behavior_qsvc.pkl', 'wb') as f:
dill.dump(pkg_beh, f)
- Features: 5
- Qubits: 5
- Gates: OrderedDict({'ZZFeatureMap': 1})
- Accuracy: 0.6880 (68.80%)
- Training time: 187.33s
- Support vectors: 0
- Features: 6
- Qubits: 6
- Gates: OrderedDict({'ZZFeatureMap': 1})
- Accuracy: 0.7480 (74.80%)
- Training time: 336.65s
- Support vectors: 0
In [2]:
def predict_top_k_svm(model, scaler, X, le_category, k=5):
X_scaled = scaler.transform(X)
decision_scores = model.decision_function(X_scaled)
if decision_scores.ndim == 1:
decision_scores = decision_scores.reshape(-1, 1)
results = []
for i in range(len(X)):
scores = decision_scores[i]
n = len(scores)
k_eff = min(k, n)
top_k_indices = np.argsort(scores)[::-1][:k_eff]
top_k_scores = scores[top_k_indices]
exp_scores = np.exp(top_k_scores - np.max(top_k_scores))
confidences = 100 * exp_scores / exp_scores.sum()
category_names = le_category.inverse_transform(top_k_indices)
results.append(list(zip(category_names, confidences)))
return results
In [3]:
print("\nTEST 1: 35y woman, France, loyalty yes, Mall")
g_enc = le_gender.transform(['Female'])[0]
c_enc = le_country.transform(['France'])[0]
s_enc = le_store_type.transform(['Mall'])[0]
X_test = np.array([[35, g_enc, c_enc, 1, s_enc]])
top_k = predict_top_k_svm(qsvc_demo, scaler_demo, X_test, le_category, k=5)
print("\n Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
print(f" {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 1: 35y woman, France, loyalty yes, Mall
Top 5:
1. Praline (confidence: 70.6%)
2. Milk (confidence: 19.3%)
3. Truffle (confidence: 6.8%)
4. Dark (confidence: 2.5%)
5. White (confidence: 0.8%)
In [4]:
print("\nTEST 2: 22y man, USA, loyalty no, Online")
g_enc = le_gender.transform(['Male'])[0]
c_enc = le_country.transform(['USA'])[0]
s_enc = le_store_type.transform(['Online'])[0]
X_test = np.array([[22, g_enc, c_enc, 0, s_enc]])
top_k = predict_top_k_svm(qsvc_demo, scaler_demo, X_test, le_category, k=5)
print("\n Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
print(f" {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 2: 22y man, USA, loyalty no, Online
Top 5:
1. Praline (confidence: 67.0%)
2. Truffle (confidence: 23.1%)
3. Milk (confidence: 7.2%)
4. Dark (confidence: 2.0%)
5. White (confidence: 0.7%)
In [5]:
print("\nTEST 3: Identification by behavior (3 items @ 18.50, discount 0.10)")
qty, price, disc = 3, 18.50, 0.10
avg_spend = price * (1 - disc)
is_prem = 1 if price >= 12 else 0
is_disc = 1 if disc > 0 else 0
X_test = np.array([[qty, price, disc, avg_spend, is_prem, is_disc]])
top_k = predict_top_k_svm(qsvc_beh, scaler_beh, X_test, le_category, k=5)
print("\n Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
print(f" {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 3: Identification by behavior (3 items @ 18.50, discount 0.10)
Top 5:
1. Praline (confidence: 67.1%)
2. Dark (confidence: 22.6%)
3. Milk (confidence: 7.0%)
4. White (confidence: 2.4%)
5. Truffle (confidence: 0.9%)
In [6]:
print("\nTEST 4: 60y woman, UK, loyalty yes, Retail")
g_enc = le_gender.transform(['Female'])[0]
c_enc = le_country.transform(['UK'])[0]
s_enc = le_store_type.transform(['Retail'])[0]
X_test = np.array([[60, g_enc, c_enc, 1, s_enc]])
top_k = predict_top_k_svm(qsvc_demo, scaler_demo, X_test, le_category, k=5)
print("\n Top 5:")
for i, (name, conf) in enumerate(top_k[0], 1):
print(f" {i}. {name:10s} (confidence: {conf:.1f}%)")
TEST 4: 60y woman, UK, loyalty yes, Retail
Top 5:
1. Praline (confidence: 67.0%)
2. Dark (confidence: 22.6%)
3. White (confidence: 7.5%)
4. Milk (confidence: 2.2%)
5. Truffle (confidence: 0.7%)
In [7]:
#You can use this cell on its own for prediction
import dill
import numpy as np
import pandas as pd
class ChocolatePredictor:
def __init__(self, model_path):
self.model_path = model_path
self._load_model()
def _load_model(self):
with open(self.model_path, 'rb') as f:
pkg = dill.load(f)
self.model = pkg['model']
self.scaler = pkg['scaler']
self.le_category = pkg['le_category']
self.le_gender = pkg.get('le_gender')
self.le_country = pkg.get('le_country')
self.le_store_type = pkg.get('le_store_type')
self.features = pkg['features']
self.dataset = pkg.get('dataset')
self.train_accuracy = pkg.get('accuracy')
self.training_time = pkg.get('training_time')
if self.train_accuracy:
print(f" - Train accuracy: {self.train_accuracy*100:.2f}%")
def _prepare_features(self, customer_data):
if isinstance(customer_data, dict):
age = customer_data['age']
gender = customer_data['gender']
country = customer_data['country']
loyalty = customer_data.get('loyalty_member', 0)
store_t = customer_data.get('store_type', 'Retail')
g_enc = self.le_gender.transform([gender])[0]
c_enc = self.le_country.transform([country])[0]
s_enc = self.le_store_type.transform([store_t])[0]
return np.array([[age, g_enc, c_enc, loyalty, s_enc]])
elif isinstance(customer_data, pd.DataFrame):
return customer_data[self.features].values
else:
return customer_data
def predict_top_k(self, customer_data, k=5, show_scores=False):
"""
Args:
customer_data: dict, DataFrame or array
k: number of predictions
show_scores: print decision scores
Returns:
list: [(category_name, confidence), ...]
"""
X = self._prepare_features(customer_data)
X_scaled = self.scaler.transform(X)
decision_scores = self.model.decision_function(X_scaled)
if decision_scores.ndim == 1:
decision_scores = decision_scores.reshape(-1, 1)
if show_scores:
print(f"Decision scores shape: {decision_scores.shape}")
results = []
for i in range(len(X)):
scores = decision_scores[i]
k_eff = min(k, len(scores))
top_k_indices = np.argsort(scores)[::-1][:k_eff]
top_k_scores = scores[top_k_indices]
exp_scores = np.exp(top_k_scores - np.max(top_k_scores))
confidences = 100 * exp_scores / exp_scores.sum()
category_names = self.le_category.inverse_transform(top_k_indices)
results.append(list(zip(category_names, confidences)))
return results[0] if len(results) == 1 else results
def predict_single(self, age, gender, country, loyalty_member=0,
store_type='Retail', k=5):
"""
Args:
age: age
gender: 'Male' / 'Female'
country: country
loyalty_member: 0/1
store_type: 'Retail' / 'Mall' / 'Airport' / 'Online'
k: number of predictions
"""
customer_data = {
'age': age,
'gender': gender,
'country': country,
'loyalty_member': loyalty_member,
'store_type': store_type
}
return self.predict_top_k(customer_data, k=k)
def info(self):
print(f"Features: {self.features}")
print(f"Total samples in dataset: {len(self.dataset) if self.dataset is not None else 'N/A'}")
print(f"Train accuracy: {self.train_accuracy*100:.2f}%" if self.train_accuracy else "N/A")
print(f"Training time: {self.training_time:.2f}s" if self.training_time else "N/A")
if hasattr(self.model, 'support_'):
print(f"Support vectors: {len(self.model.support_)}")
def list_customers(self, n=10):
if self.dataset is not None:
print(f"\nFirst {n} customers in dataset:")
cols = [c for c in ['customer_id', 'age', 'gender',
'country', 'category'] if c in self.dataset.columns]
print(self.dataset[cols].head(n))
else:
print("Dataset not available in model package.")
predictor = ChocolatePredictor('chocolate_identifier_demographic_qsvc.pkl')
predictor.info()
print()
print("TEST1: woman 35y, France, loyalty yes, Mall")
result = predictor.predict_single(35, 'Female', 'France', 1, 'Mall', k=5)
for i, (name, conf) in enumerate(result, 1):
print(f"{i}. {name:10s} - {conf:.1f}%")
print()
print("TEST2: man 22y, USA, loyalty no, Online")
result = predictor.predict_single(22, 'Male', 'USA', 0, 'Online', k=5)
for i, (name, conf) in enumerate(result, 1):
print(f"{i}. {name:10s} - {conf:.1f}%")
print()
print("TEST3: Custom profile")
custom_customer = {
'age': 60,
'gender': 'Female',
'country': 'UK',
'loyalty_member': 1,
'store_type': 'Retail'
}
result = predictor.predict_top_k(custom_customer, k=5)
for i, (name, conf) in enumerate(result, 1):
print(f"{i}. {name:10s} - {conf:.1f}%")
predictor.list_customers(10)
- Train accuracy: 68.80% Features: ['age', 'gender_encoded', 'country_encoded', 'loyalty_member', 'store_type_encoded'] Total samples in dataset: 250 Train accuracy: 68.80% Training time: 187.33s Support vectors: 249 TEST1: woman 35y, France, loyalty yes, Mall 1. Praline - 70.6% 2. Milk - 19.3% 3. Truffle - 6.8% 4. Dark - 2.5% 5. White - 0.8% TEST2: man 22y, USA, loyalty no, Online 1. Praline - 67.0% 2. Truffle - 23.1% 3. Milk - 7.2% 4. Dark - 2.0% 5. White - 0.7% TEST3: Custom profile 1. Praline - 67.0% 2. Dark - 22.6% 3. White - 7.5% 4. Milk - 2.2% 5. Truffle - 0.7% First 10 customers in dataset: customer_id age gender country category 0 C040749 44 Male UK White 1 C020161 63 Female Australia Praline 2 C048069 35 Male UK Milk 3 C047901 37 Female USA Praline 4 C033950 57 Female Canada Truffle 5 C008918 35 Male Canada Dark 6 C002897 55 Female France Dark 7 C038072 36 Female USA Dark 8 C003786 63 Female Australia Praline 9 C043148 26 Female Germany Milk
In [8]:
import matplotlib.pyplot as plt
# Kernel matrix = quantum fidelity |⟨φ(x_i)|φ(x_j)⟩|² for all parametres
print("kernel matrix (demographic)...")
K_demo = quantum_kernel.evaluate(x_vec=X_demo_scaled)
print(f" Shape: {K_demo.shape}")
print("kernel matrix (behavior)...")
K_beh = quantum_kernel_beh.evaluate(x_vec=X_beh_scaled)
print(f" Shape: {K_beh.shape}")
order = np.argsort(y)
K_demo_sorted = K_demo[order][:, order]
K_beh_sorted = K_beh[order][:, order]
class_counts = np.bincount(y)
boundaries = np.cumsum(class_counts)[:-1]
centers = np.cumsum(class_counts) - class_counts / 2
cat_names = le_category.inverse_transform(np.arange(len(class_counts)))
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, K, title in [
(axes[0], K_demo_sorted, 'Quantum Kernel — Demographic'),
(axes[1], K_beh_sorted, 'Quantum Kernel — Behavior'),
]:
im = ax.imshow(K, cmap='viridis', aspect='equal', vmin=0, vmax=1)
for b in boundaries:
ax.axhline(b, color='red', linewidth=0.5, alpha=0.6)
ax.axvline(b, color='red', linewidth=0.5, alpha=0.6)
ax.set_title(f'{title}\n(by category, n={K.shape[0]})')
ax.set_xticks(centers); ax.set_xticklabels(cat_names, rotation=45)
ax.set_yticks(centers); ax.set_yticklabels(cat_names)
plt.colorbar(im, ax=ax, fraction=0.046, label='fidelity')
plt.tight_layout()
plt.show()
print(f"\nDemographic kernel: mean={K_demo.mean():.3f} std={K_demo.std():.3f} "
f"diag mean={K_demo.diagonal().mean():.3f}")
print(f"Behavior kernel: mean={K_beh.mean():.3f} std={K_beh.std():.3f} "
f"diag mean={K_beh.diagonal().mean():.3f}")
kernel matrix (demographic)... Shape: (250, 250) kernel matrix (behavior)... Shape: (250, 250)
Demographic kernel: mean=0.067 std=0.106 diag mean=1.004 Behavior kernel: mean=0.036 std=0.095 diag mean=1.002