from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier

# Chain scaling + model so the scaler is fit only on training folds
# (prevents test-set leakage compared to scaling up front).
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
    )),
])
pipe.fit(X_train, y_train)
score = pipe.score(X_test, y_test)
from sklearn.model_selection import GridSearchCV

# Pipeline step hyper-parameters are addressed as "<step>__<param>".
params = {
    "clf__n_estimators": [100, 200, 300],
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__max_depth": [3, 5, 7],
}
# 5-fold CV over the grid, macro-F1 scoring, all cores in parallel.
gs = GridSearchCV(pipe, params, cv=5, scoring="f1_macro", n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_, gs.best_score_)
| Metric | Function | Use When |
|---|---|---|
| Accuracy | accuracy_score | Balanced classes |
| F1 Score | f1_score | Imbalanced classes |
| ROC-AUC | roc_auc_score | Ranking quality |
| MSE/RMSE | mean_squared_error | Regression |
| R² | r2_score | Regression fit |
| Log Loss | log_loss | Probability output |

| Algorithm | Best For | Key Params |
|---|---|---|
| LinearRegression | Continuous, linear | fit_intercept |
| LogisticRegression | Binary/multi clf | C, solver |
| RandomForest | Tabular, robust | n_estimators, max_depth |
| XGBoost | Kaggle, tabular | learning_rate (eta), subsample, colsample_bytree |
| SVM | High-dim, small data | C, kernel, gamma |
| KNN | Simple baseline | n_neighbors, metric |
| K-Means | Clustering | n_clusters, init |
| DBSCAN | Density clustering | eps, min_samples |
import numpy as np  # was missing: np.log1p is used below
import pandas as pd
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
)

# Common feature-engineering transforms on a DataFrame `df`.
df["log_feat"] = np.log1p(df["skewed_col"])             # compress right-skewed values
df["interact"] = df["a"] * df["b"]                      # interaction feature
df["binned"] = pd.cut(df["age"], bins=5, labels=False)  # equal-width integer bins
# One-hot encode; drop_first avoids the dummy-variable trap for linear models.
df = pd.get_dummies(df, columns=["category"], drop_first=True)
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, TimeSeriesSplit
)

# ~70/15/15 split: hold out the test set first, then carve validation
# from the remainder (0.18 * 0.85 ≈ 0.15 of the full data).
X_tv, X_test, y_tv, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
# random_state added here too — the original omitted it, making the
# train/val split non-reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_tv, y_tv, test_size=0.18, stratify=y_tv, random_state=42
)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Float n_components keeps enough components for 95% explained variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
print(f"Dims: {X_pca.shape[1]}, var: {pca.explained_variance_ratio_.sum():.2%}")

# Run t-SNE on the PCA output — cheaper and less noisy than raw features.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_2d = tsne.fit_transform(X_pca)
import torch.nn as nn


class Net(nn.Module):
    """Small MLP head: 128 -> 256 (LayerNorm, GELU, 10% dropout) -> 10 logits."""

    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(128, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(256, 10),
        )

    def forward(self, x):
        # x: (batch, 128) -> (batch, 10) raw logits.
        return self.layers(x)
for epoch in range(epochs):
    # --- training pass ---
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = criterion(pred, yb)
        opt.zero_grad()
        loss.backward()
        # Clip gradient norm to 1.0 to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
    scheduler.step()  # one LR-schedule step per epoch

    # --- validation pass (no grad tracking) ---
    model.eval()
    with torch.no_grad():
        val_loss = sum(
            criterion(model(xb.to(device)), yb.to(device))
            for xb, yb in val_loader
        )
| Optimizer | Best For |
|---|---|
| Adam | General default, fast convergence |
| AdamW | Transformers, proper weight decay |
| SGD + momentum | Vision, fine-tuning, sharp minima |
| Lion | Large models, memory efficient |
# AdamW (decoupled weight decay) with cosine LR decay over the full run.
base_lr = 3e-4
opt = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=0.01)
sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)
| Loss | Task |
|---|---|
| CrossEntropyLoss | Multi-class classification |
| BCEWithLogitsLoss | Binary / multi-label |
| MSELoss | Regression |
| HuberLoss | Robust regression |
| NLLLoss | Log-prob outputs |
| CTCLoss | Sequence-to-sequence (ASR) |
| TripletMarginLoss | Metric learning |

| Function | Formula | Use |
|---|---|---|
| ReLU | max(0,x) | Hidden layers (default) |
| GELU | x·Φ(x) | Transformers, BERT |
| SiLU/Swish | x·σ(x) | LLaMA, modern nets |
| Sigmoid | 1/(1+e⁻ˣ) | Binary output |
| Tanh | (eˣ−e⁻ˣ)/(eˣ+e⁻ˣ) | RNNs, normalised |
| Softmax | eˣᵢ/Σeˣⱼ | Final classification |
# Save a full training checkpoint — model, optimizer, and scheduler state —
# so a run can be resumed exactly where it stopped.
torch.save({
    "epoch": epoch,
    "model": model.state_dict(),
    "optimizer": opt.state_dict(),
    "scheduler": sched.state_dict(),
    "loss": loss.item(),
}, "checkpoint.pt")

# Resume: restore every state dict that was saved, not just the model.
# (The original restored only the model — that silently resets the LR
# schedule and optimizer momentum on resume.)
ckpt = torch.load("checkpoint.pt", map_location=device)
model.load_state_dict(ckpt["model"])
opt.load_state_dict(ckpt["optimizer"])
sched.load_state_dict(ckpt["scheduler"])
start_epoch = ckpt["epoch"] + 1

# Regularization toolbox (these statements were fused onto the load line,
# producing invalid syntax; separated here):
nn.Dropout(0.3)
nn.Dropout2d(0.2)
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
import torch
import torch.nn.functional as F


def scaled_dot_attention(Q, K, V, mask=None):
    """Scaled dot-product attention.

    Q, K, V are (..., seq, d_k) tensors; ``mask`` is broadcastable to the
    score matrix, with 0 marking positions to hide. Returns the attended
    values and the attention weights.
    """
    scale = Q.size(-1) ** 0.5
    scores = Q @ K.transpose(-2, -1) / scale
    if mask is not None:
        # Large negative score -> ~0 weight after softmax at masked slots.
        scores = scores.masked_fill(mask == 0, -1e9)
    attn = F.softmax(scores, dim=-1)
    return attn @ V, attn
from transformers import pipeline

# One ready-made pipeline per common NLP task; default models download
# from the Hub on first use unless a model id is given explicitly.
tasks = {
    "sentiment": pipeline("sentiment-analysis"),
    "ner": pipeline("ner", aggregation_strategy="simple"),
    "summarize": pipeline("summarization", model="facebook/bart-large-cnn"),
    "qa": pipeline("question-answering"),
    "translate": pipeline("translation_en_to_fr"),
    "zero-shot": pipeline("zero-shot-classification"),
    "fill-mask": pipeline("fill-mask", model="bert-base-uncased"),
    "text-gen": pipeline("text-generation", model="gpt2"),
}
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer,
    Trainer, TrainingArguments
)

# 3-way sequence classifier fine-tuned from BERT.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3
)

args = TrainingArguments(
    output_dir="./out",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    # NOTE(review): renamed to `eval_strategy` in transformers >= 4.46 —
    # confirm the installed version.
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)
trainer.train()
from peft import get_peft_model, LoraConfig, TaskType

# Low-rank adapters on the attention projections only; base weights frozen.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,                                   # adapter rank
    lora_alpha=32,                          # scaling factor (alpha / r)
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(base_model, config)
model.print_trainable_parameters()  # trainable: 0.1% of total params
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains import RetrievalQA

# Index the documents, then answer questions over the top-4 retrieved chunks.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(docs, embeddings)
retriever = db.as_retriever(search_kwargs={"k": 4})

chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4o-mini"),
    retriever=retriever,
)
result = chain.invoke({"query": question})
| Technique | When to Use |
|---|---|
| Zero-shot | Simple tasks, capable models |
| Few-shot | Format/style matters, examples help |
| Chain-of-Thought | Reasoning, math, multi-step logic |
| Tree-of-Thought | Complex decisions, branching |
| ReAct | Tool use, agents, search |
| Self-Consistency | Reduce variance, vote on answers |
| Role Prompting | Persona, domain expertise |

| Param | Range | Effect |
|---|---|---|
| temperature | 0–2 | 0=deterministic, high=creative |
| top_p | 0–1 | Nucleus sampling mass |
| top_k | 1–∞ | Restrict to top k tokens |
| frequency_penalty | -2–2 | Reduce repetition |
| presence_penalty | -2–2 | Encourage new topics |
| max_tokens | 1–ctx | Output length cap |

| Model | Year | Key Innovation |
|---|---|---|
| AlexNet | 2012 | Deep CNN on GPU, ReLU, Dropout |
| VGG-16/19 | 2014 | Uniform 3×3 convs, depth |
| ResNet-50 | 2015 | Residual skip connections |
| EfficientNet | 2019 | Compound scaling (W×D×R) |
| ViT | 2020 | Patch-based transformer |
| ConvNeXt | 2022 | CNN with transformer design |
| SAM 2 | 2024 | Segment anything, video |
from torchvision import transforms as T

# ImageNet channel statistics expected by pretrained torchvision backbones.
_MEAN = [0.485, 0.456, 0.406]
_STD = [0.229, 0.224, 0.225]

# Training: random augmentation for regularization.
train_tfm = T.Compose([
    T.RandomResizedCrop(224),
    T.RandomHorizontalFlip(),
    T.ColorJitter(0.4, 0.4, 0.4, 0.1),
    T.RandomGrayscale(p=0.2),
    T.ToTensor(),
    T.Normalize(_MEAN, _STD),
])
# Validation: deterministic center crop only.
val_tfm = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(_MEAN, _STD),
])
from torchvision.models import resnet50, ResNet50_Weights

# Load an ImageNet-pretrained backbone and freeze all of its weights.
model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
for param in model.parameters():
    param.requires_grad = False

# Replace the classification head; fresh layers are trainable by default.
model.fc = nn.Sequential(
    nn.Linear(model.fc.in_features, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, num_classes),
)
# Only the new head's parameters go into the optimizer.
opt = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
| Method | Type | Model |
|---|---|---|
| Semantic | Class per pixel | DeepLabV3, SegFormer |
| Instance | Individual objects | Mask R-CNN, YOLACT |
| Panoptic | Semantic+Instance | Panoptic-FPN, DETR |
| Promptable | Any object | SAM, SAM 2 |

| Algorithm | Type | Best For |
|---|---|---|
| Q-Learning | Model-free, off-policy | Discrete tabular |
| DQN | Deep, off-policy | Discrete, Atari |
| DDQN | Deep, off-policy | Overestimation fix |
| A3C/A2C | Policy gradient | Parallel envs |
| PPO | On-policy, clip | Continuous actions |
| SAC | Off-policy, entropy | Continuous, robust |
| TD3 | Deterministic PG | Robotics, continuous |
import gymnasium as gym

env = gym.make("CartPole-v1", render_mode="rgb_array")
obs, info = env.reset(seed=42)
for _ in range(500):
    # Random-policy baseline.
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    # Episode ended (failure or time limit) -> start a fresh one.
    if terminated or truncated:
        obs, info = env.reset()
env.close()
from stable_baselines3 import PPO, SAC
from stable_baselines3.common.env_util import make_vec_env

# Four parallel environments speed up on-policy rollout collection.
env = make_vec_env("LunarLander-v2", n_envs=4)
model = PPO(
    "MlpPolicy",
    env,
    n_steps=2048,
    batch_size=64,
    learning_rate=3e-4,
    verbose=1,
)
model.learn(total_timesteps=500_000)
model.save("ppo_lunar")
| Distribution | Params | Use in AI |
|---|---|---|
| Gaussian N(μ,σ²) | μ, σ | Weight init, VAE latent |
| Bernoulli | p | Binary classification |
| Categorical | p₁..pₖ | Token sampling, Softmax |
| Dirichlet | α | Topic models, LDA |
| Poisson | λ | Count data modeling |
| Beta | α, β | Bayesian priors, Thompson |

| Metric | Formula | Use |
|---|---|---|
| Euclidean | √Σ(aᵢ−bᵢ)² | KNN, KMeans |
| Cosine | a·b / (‖a‖‖b‖) | Embeddings, NLP |
| Manhattan | Σ|aᵢ−bᵢ| | Sparse, robust |
| KL Divergence | Σ P log(P/Q) | VAE, distributions |
| Mahalanobis | √(a-b)ᵀΣ⁻¹(a-b) | Anomaly detection |
import numpy as np

# Seed for reproducible demo values (the original was unseeded).
np.random.seed(0)

A = np.random.randn(4, 4)
b = np.random.randn(4)             # RHS for the linear solve (was undefined -> NameError)

U, S, Vt = np.linalg.svd(A)        # singular value decomposition
eigenvals = np.linalg.eigvals(A)
rank = np.linalg.matrix_rank(A)
x = np.linalg.solve(A, b)          # solve A @ x = b (prefer over inv(A) @ b)
inv_A = np.linalg.inv(A)
det_A = np.linalg.det(A)
norm_A = np.linalg.norm(A, ord="fro")
| Library | Purpose | Install |
|---|---|---|
| numpy | Array math, linear algebra | pip install numpy |
| pandas | DataFrames, data wrangling | pip install pandas |
| torch | Deep learning framework | pip install torch |
| transformers | Pretrained models, NLP | pip install transformers |
| scikit-learn | Classical ML, preprocessing | pip install scikit-learn |
| xgboost | Gradient boosting | pip install xgboost |
| langchain | LLM pipelines, agents | pip install langchain |
| gymnasium | RL environments | pip install gymnasium |
| diffusers | Diffusion models (HF) | pip install diffusers |
| faiss-cpu | Vector similarity search | pip install faiss-cpu |
| einops | Tensor rearranging | pip install einops |
| accelerate | Multi-GPU, mixed precision | pip install accelerate |
from accelerate import Accelerator

# Wrap model/optimizer/dataloader for device placement and bf16 autocast.
accelerator = Accelerator(mixed_precision="bf16")
model, optimizer, train_loader = accelerator.prepare(
    model, optimizer, train_loader
)

with accelerator.autocast():
    outputs = model(batch)
    loss = criterion(outputs, targets)

# zero_grad was missing in the original — without it gradients from
# previous steps accumulate into this update.
optimizer.zero_grad()
accelerator.backward(loss)   # handles scaling / multi-device gradient sync
optimizer.step()
import wandb

run_config = {
    "lr": 3e-4,
    "epochs": 20,
    "batch_size": 64,
    "arch": "resnet50",
}
wandb.init(project="my-model", config=run_config)

wandb.log({"loss": loss, "acc": acc, "epoch": epoch})  # per-step metrics
wandb.watch(model, log="all", log_freq=100)            # track grads + weights
wandb.finish()
import faiss, numpy as np

d = 768                       # embedding dimensionality
# Inner product on L2-normalized vectors == cosine similarity.
index = faiss.IndexFlatIP(d)

faiss.normalize_L2(vectors)   # in-place; expects float32, shape (n, d)
index.add(vectors)

# assumes embed_text returns a float32 array of shape (1, d) — TODO confirm
query = embed_text("What is attention?")
faiss.normalize_L2(query)
D, I = index.search(query, k=5)
print(f"Top-5 ids: {I[0]}, scores: {D[0]}")
# Assorted GPU memory / speed levers.
torch.cuda.empty_cache()                      # release cached allocator blocks
torch.backends.cuda.matmul.allow_tf32 = True  # faster matmuls on Ampere+ GPUs
model = model.half()                          # fp16 weights: halves memory

# Mixed-precision region: compute in bf16 where safe.
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
    output = model(input)

# Gradient checkpointing: recompute activations in backward to save memory.
torch.utils.checkpoint.checkpoint(fn, *args)
| Provider | Top Models | Context |
|---|---|---|
| OpenAI | GPT-4o, o3, o4-mini | 128K |
| Anthropic | Claude 3.5, Claude 3 Opus | 200K |
| Google | Gemini 2.0, Gemini 1.5 Pro | 1M |
| Meta (OSS) | LLaMA 3.1, 3.3 (405B) | 128K |
| Mistral | Mistral Large, Mixtral 8x22B | 64K |
| Groq | LLaMA 3 (fast inference) | 128K |