Day 1 - Part 1: Getting Started with ML
Master 2 (203) in Financial Markets, Paris Dauphine - PSL University
2025-10-31
Installing Scikit-learn: pip install scikit-learn
Verify installation: python -c "import sklearn; print(sklearn.__version__)"
from sklearn import datasets # Built-in datasets
from sklearn import preprocessing # Data preprocessing
from sklearn import model_selection # Train/test split
from sklearn import linear_model # Linear models
from sklearn import tree # Decision trees
from sklearn import ensemble # Ensemble methods
from sklearn import metrics # Evaluation metrics
# Problem: Approximate a non-linear function
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Fixed seed so the fitted coefficient and intercept are identical on every
# run (the other snippets in this file pin random_state=42 as well).
rng = np.random.default_rng(42)

# Generate data: y = 2x + 1 + Gaussian noise (standard deviation 0.5)
# reshape(-1, 1) turns the 100 points into a single-feature column, the
# (n_samples, n_features) layout scikit-learn estimators expect.
X = np.linspace(0, 10, 100).reshape(-1, 1)
y = 2 * X.ravel() + 1 + rng.standard_normal(100) * 0.5

# Create and train model
model = LinearRegression()
model.fit(X, y)

# Predict on the same inputs the model was fitted on (in-sample fit)
y_pred = model.predict(X)

# The printed values should be close to the true slope 2 and intercept 1.
print(f"Coefficient: {model.coef_[0]:.2f}")
print(f"Intercept: {model.intercept_:.2f}")

# Approximating non-linear functions:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Fixed seed so the noisy sample (and therefore the fit) is reproducible.
rng = np.random.default_rng(42)

# Generate non-linear data: y = sin(x) + Gaussian noise (standard deviation 0.1)
X = np.linspace(0, 10, 100).reshape(-1, 1)
y = np.sin(X).ravel() + rng.standard_normal(100) * 0.1

# Create polynomial regression model (degree 5).
# The pipeline first expands x into polynomial features up to degree 5 and
# then fits an ordinary linear regression on those expanded features.
poly_model = make_pipeline(
    PolynomialFeatures(degree=5),
    LinearRegression(),
)
poly_model.fit(X, y)
y_poly_pred = poly_model.predict(X)

# Essential for evaluating model performance:
from sklearn.model_selection import train_test_split

# Seed the data generation too: the split below is already pinned with
# random_state=42, but that only helps if the data itself is reproducible.
rng = np.random.default_rng(42)

# Generate data: only the first two features drive the target, the other
# three are pure noise columns.
X = rng.standard_normal((1000, 5))  # 1000 samples, 5 features
y = X[:, 0] * 2 + X[:, 1] * 3 + rng.standard_normal(1000) * 0.5

# Split: 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

from sklearn.metrics import mean_squared_error, r2_score
# Train model on the training portion only, so the test set stays unseen
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Calculate metrics on the held-out data
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

# Binary classification with Logistic Regression:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

# Synthetic 2-D classification data: 200 samples, two informative features,
# no redundant ones, one cluster per class, fixed seed.
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=42,
)

# Hold out 20% of the samples for testing (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create and train classifier
# Fit a logistic-regression classifier on the training split
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict hard class labels for the test samples
y_pred = clf.predict(X_test)

# Predict probabilities (one column per class for each test sample)
y_proba = clf.predict_proba(X_test)

# For classifiers, score() reports mean accuracy; comparing the two numbers
# shows how well training performance carries over to unseen data.
print(f"Training accuracy: {clf.score(X_train, y_train):.3f}")
print(f"Test accuracy: {clf.score(X_test, y_test):.3f}")

from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Compare the test-set predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")

# Confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

import matplotlib.pyplot as plt
# Create mesh for decision boundary: a dense grid covering the data range
# with a margin of 1 on every side.
h = 0.02  # step size
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

# Predict for each point in mesh: flatten the grid into (n_points, 2)
# feature rows, classify them, then restore the grid shape for plotting.
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the predicted regions with the data points (coloured by label) on top
plt.contourf(xx, yy, Z, alpha=0.3)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
plt.title('Decision Boundary')
plt.show()

# Better evaluation with k-fold cross-validation:
Important for many algorithms:
from sklearn.preprocessing import StandardScaler

# Create scaler
scaler = StandardScaler()

# Fit on training data, transform both train and test.
# The scaling statistics come from the training set only, so no information
# from the test set leaks into preprocessing.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model on scaled data
clf_scaled = LogisticRegression()
clf_scaled.fit(X_train_scaled, y_train)

print(f"Accuracy with scaling: {clf_scaled.score(X_test_scaled, y_test):.3f}")

# Non-linear classification:
from sklearn.tree import DecisionTreeClassifier

# Create and train decision tree (depth capped at 3, seeded)
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(X_train, y_train)

# Evaluate on both splits; a large gap between them would indicate overfitting
train_score = tree_clf.score(X_train, y_train)
test_score = tree_clf.score(X_test, y_test)

print(f"Training accuracy: {train_score:.3f}")
print(f"Test accuracy: {test_score:.3f}")

# Ensemble of decision trees:
from sklearn.ensemble import RandomForestClassifier

# Create random forest
rf_clf = RandomForestClassifier(
    n_estimators=100,  # number of trees
    max_depth=5,
    random_state=42,
)
rf_clf.fit(X_train, y_train)

print(f"Random Forest accuracy: {rf_clf.score(X_test, y_test):.3f}")

# Feature importance: one value per input feature
print(f"Feature importances: {rf_clf.feature_importances_}")

# fit(), predict(), score()
# ✅ Use for:
❌ Not ideal for:
Resources: