Master 1 (203) in Financial Markets
Paris Dauphine - PSL University
2025-10-28
import numpy as np
import time
# Python list
python_list = list(range(1000000))
start = time.time()
result = [x * 2 for x in python_list]
print(f"Python list: {time.time() - start:.4f}s")
# NumPy array
numpy_array = np.arange(1000000)
start = time.time()
result = numpy_array * 2
print(f"NumPy array: {time.time() - start:.4f}s")int8, int16, int32, int64 (signed), uint8, uint16, etc. (unsigned)float16, float32, float64 (default)complex64, complex128bool_[1, 2, 3][[1, 2], [3, 4]]import numpy as np
scalar = np.array(42)
vector = np.array([1, 2, 3, 4])
matrix = np.array([[1, 2, 3], [4, 5, 6]])
tensor = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
print(f"Scalar shape: {scalar.shape}") # ()
print(f"Vector shape: {vector.shape}") # (4,)
print(f"Matrix shape: {matrix.shape}") # (2, 3)
print(f"Tensor shape: {tensor.shape}") # (2, 2, 2)import numpy as np
# From Python list
arr1 = np.array([1, 2, 3, 4, 5])
# Range of values
arr2 = np.arange(0, 10, 2) # [0, 2, 4, 6, 8]
# Evenly spaced values
arr3 = np.linspace(0, 1, 5) # [0.0, 0.25, 0.5, 0.75, 1.0]
# Zeros and ones
zeros = np.zeros((3, 4))
ones = np.ones((2, 3))
# Random values
random_arr = np.random.rand(3, 3) # Uniform [0, 1)
normal_arr = np.random.randn(3, 3) # Standard normal

Arithmetic operators +, -, *, /, ** are applied element-by-element.

import numpy as np
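# Quick illustration: each arithmetic operator acts element-by-element
a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
print(a + b)  # [5. 7. 9.]
print(a - b)  # [-3. -3. -3.]
print(a * b)  # [ 4. 10. 18.]
print(a / b)  # [0.25 0.4  0.5 ]
print(a ** 2) # [1. 4. 9.]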
# Broadcasting: scalar with array
arr = np.array([1, 2, 3, 4])
result = arr + 10 # [11, 12, 13, 14]
# Broadcasting: different shapes
matrix = np.array([[1, 2, 3], [4, 5, 6]])
vector = np.array([10, 20, 30])
result = matrix + vector
# [[11, 22, 33],
# [14, 25, 36]]
# Broadcasting: column vector
col_vector = np.array([[1], [2]])
result = matrix + col_vector
# [[2, 3, 4],
# [6, 7, 8]]

import numpy as np
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
# Element-wise multiplication
elementwise = A * B # [[5, 12], [21, 32]]
# Matrix multiplication (dot product)
matmul1 = A @ B # [[19, 22], [43, 50]]
matmul2 = np.dot(A, B) # Same result
# Transpose
A_T = A.T # [[1, 3], [2, 4]]
# Inverse (for square matrices)
A_inv = np.linalg.inv(A)

import numpy as np
arr = np.array([[1, 2, 3], [4, 5, 6]])
# Statistical functions
print(np.mean(arr)) # 3.5
print(np.std(arr)) # 1.707...
print(np.sum(arr)) # 21
print(np.sum(arr, axis=0)) # [5, 7, 9] (sum columns)
print(np.sum(arr, axis=1)) # [6, 15] (sum rows)
# Other useful functions
print(np.max(arr)) # 6
print(np.argmax(arr)) # 5 (index of max)
print(np.sqrt(arr)) # Element-wise square root

import pandas as pd
import numpy as np
# From dictionary
df1 = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie'],
'age': [25, 30, 35],
'salary': [50000, 60000, 70000]
})
# From NumPy array
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df2 = pd.DataFrame(data, columns=['A', 'B', 'C'])
# From CSV file
df3 = pd.read_csv('data.csv')
# Basic info
print(df1.head()) # First 5 rows
print(df1.info()) # Data types and memory
print(df1.describe()) # Statistical summary

Use .apply() for custom operations.

import pandas as pd
df = pd.DataFrame({
'price': [100, 200, 150],
'quantity': [10, 5, 8],
'discount': [0.1, 0.2, 0.15]
})
# Create new columns from operations
df['total'] = df['price'] * df['quantity']
df['discounted_price'] = df['price'] * (1 - df['discount'])
df['revenue'] = df['discounted_price'] * df['quantity']
# Apply function to column
df['price_category'] = df['price'].apply(
lambda x: 'High' if x > 150 else 'Low'
)

- .loc[]: Label-based indexing
- .iloc[]: Integer position-based indexing

import pandas as pd
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie', 'David'],
'age': [25, 30, 35, 40],
'city': ['Paris', 'London', 'Berlin', 'Madrid']
}, index=['A', 'B', 'C', 'D'])
# Label-based selection
print(df.loc['A']) # Single row (Series)
print(df.loc[['A', 'C']]) # Multiple rows (DataFrame)
print(df.loc['A':'C']) # Slice (inclusive)
# Position-based selection
print(df.iloc[0]) # First row
print(df.iloc[[0, 2]]) # First and third rows
print(df.iloc[1:3]) # Rows 1 and 2 (exclusive end)

Boolean filtering (examples in the code below):
- & (and), | (or), ~ (not)
- .isin(): Check if values are in a list
- .between(): Check if values are in a range

import pandas as pd
df = pd.DataFrame({
'name': ['Alice', 'Bob', 'Charlie', 'David'],
'age': [25, 30, 35, 40],
'salary': [50000, 60000, 55000, 75000]
})
# Single condition
young = df[df['age'] < 35]
# Multiple conditions (use parentheses!)
filtered = df[(df['age'] > 25) & (df['salary'] > 55000)]
# isin method
selected = df[df['name'].isin(['Alice', 'Charlie'])]
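# The other operators mentioned above work the same way
either = df[(df['age'] < 30) | (df['salary'] > 70000)]  # Alice, David
not_oldest = df[~(df['age'] >= 40)]                      # everyone but David
mid_age = df[df['age'].between(30, 35)]                  # Bob, Charlie (inclusive)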
# Query method
result = df.query('age > 30 and salary < 70000')

GroupBy aggregations: sum(), mean(), count(), min(), max()

import pandas as pd
df = pd.DataFrame({
'department': ['Sales', 'IT', 'Sales', 'IT', 'HR'],
'employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
'salary': [50000, 60000, 55000, 65000, 52000]
})
# Group by department and calculate mean salary
avg_salary = df.groupby('department')['salary'].mean()
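# The other aggregations listed above follow the same pattern, e.g.:
total_salary = df.groupby('department')['salary'].sum()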
# Multiple aggregations
stats = df.groupby('department').agg({
'salary': ['mean', 'min', 'max', 'count']
})
# Apply custom function
df.groupby('department')['salary'].apply(lambda x: x.max() - x.min())

import pandas as pd
import numpy as np
df = pd.DataFrame({
'date': pd.date_range('2025-01-01', periods=100),
'product': np.random.choice(['A', 'B', 'C'], 100),
'region': np.random.choice(['North', 'South'], 100),
'sales': np.random.randint(100, 1000, 100)
})
# Group by multiple columns
multi_group = df.groupby(['product', 'region'])['sales'].sum()
# Transform: keep original shape
df['pct_of_product_total'] = df.groupby('product')['sales'].transform(
lambda x: x / x.sum()
)
# Filter groups
high_sales = df.groupby('product').filter(
lambda x: x['sales'].sum() > 10000
)

Dataset: Titanic passenger data from Kaggle
Tasks:
1. Load the data and explore its structure
2. Calculate survival rates by passenger class
3. Find average age by gender and survival status
4. Identify which deck had the highest survival rate
5. Create a new feature combining age groups and class

You will practice:
- Loading data with Pandas
- Selecting rows and columns
- GroupBy operations
- Creating new features
import pandas as pd
import numpy as np
# Task 1: Load the Titanic dataset and explore its structure
df = pd.read_csv('titanic.csv')
# Explore the structure
print(df.head())
print(df.info())
print(df.describe())
# Check for missing values
print(df.isnull().sum())
# Basic statistics
print(f"Total passengers: {len(df)}")
print(f"Survival rate: {df['Survived'].mean():.2%}")
print(f"\nPassengers per class:")
print(df['Pclass'].value_counts().sort_index())

# Task 2: Calculate survival rates by passenger class
survival_by_class = df.groupby('Pclass')['Survived'].agg([
('count', 'count'),
('survived', 'sum'),
('survival_rate', 'mean')
])
print("Survival rates by class:")
print(survival_by_class)
# Visualization insight:
# 1st class: ~63% survival
# 2nd class: ~47% survival
# 3rd class: ~24% survival
# Clear pattern: higher class = higher survival rate

# Task 3: Average age by gender and survival status
age_analysis = df.groupby(['Sex', 'Survived'])['Age'].mean()
print("\nAverage age by gender and survival:")
print(age_analysis)
# More detailed view
age_detail = df.groupby(['Sex', 'Survived']).agg({
'Age': ['mean', 'median', 'std', 'count']
})
print("\nDetailed age statistics:")
print(age_detail)
# Insight: Women and children first policy visible in data
# Younger passengers more likely to survive

# Task 4: Survival rate by deck (extracted from Cabin)
# First, extract deck letter from Cabin
df['Deck'] = df['Cabin'].str[0]
# Calculate survival rates by deck
deck_survival = df.groupby('Deck')['Survived'].agg([
'count', 'mean'
]).sort_values('mean', ascending=False)
print("\nSurvival rates by deck:")
print(deck_survival)
# Filter out decks with few passengers for more reliable statistics
reliable_decks = deck_survival[deck_survival['count'] >= 10]
print("\nDecks with 10+ passengers:")
print(reliable_decks)

# Task 5: Create age groups and combine with class
# Define age groups
def categorize_age(age):
    if pd.isna(age):
        return 'Unknown'
    elif age < 18:
        return 'Child'
    elif age < 35:
        return 'Young Adult'
    elif age < 60:
        return 'Adult'
    else:
        return 'Senior'
df['Age_Group'] = df['Age'].apply(categorize_age)
# Combine with passenger class
df['Class_Age_Group'] = (df['Pclass'].astype(str) + '_' +
df['Age_Group'])
# Analyze survival by this new feature
survival_by_combo = df.groupby('Class_Age_Group')['Survived'].agg([
'count', 'mean'
]).sort_values('mean', ascending=False)
print("\nSurvival rates by class and age group:")
print(survival_by_combo.head(10))