main.py
import random


class Value:
    """
    A Value stores a scalar number and its gradient.
    It effectively acts as a node in our computational graph.
    """
    def __init__(self, data, _children=(), _op='', label=''):
        self.data = data                # The actual number (e.g., -2, 5, 15)
        self.grad = 0.0                 # The derivative of the final output L with respect to this value
        self._backward = lambda: None   # The function to propagate the gradient one step back
        self._prev = set(_children)     # The parent nodes (inputs to the operation)
        self._op = _op                  # The operation that created this node (+, *, etc.)
        self.label = label              # Optional label for debugging (x, y, a, f)

    def __repr__(self):
        return f"Value(data={self.data:.4f}, grad={self.grad:.4f})"

    # -------------------------------------------------------------------
    # OPERATOR OVERLOADING
    # -------------------------------------------------------------------
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')

        def _backward():
            # Addition routes the upstream gradient to both operands unchanged
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')

        def _backward():
            # Product Rule: each operand's gradient is the other operand's value
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')

        def _backward():
            # Power Rule: d/dx (x^n) = n * x^(n-1)
            # Chain Rule: local_grad * upstream_grad
            self.grad += (other * self.data**(other - 1)) * out.grad
        out._backward = _backward
        return out

    def __neg__(self):
        return self * -1

    def __sub__(self, other):
        return self + (-other)

    def __radd__(self, other):
        return self + other

    def __rmul__(self, other):
        return self * other

    # -------------------------------------------------------------------
    # THE BACKPROPAGATION ENGINE
    # -------------------------------------------------------------------
    def backward(self):
        # Build a topological ordering of the graph, so each node is
        # processed only after every node that depends on it
        topo = []
        visited = set()

        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)

        # Reset all gradients in the graph for a clean backprop run
        for node in topo:
            node.grad = 0.0

        self.grad = 1.0
        for node in reversed(topo):
            node._backward()
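
# -------------------------------------------------------------------
# SANITY CHECK: GRADIENT ACCUMULATION
# A minimal sketch, assuming only the Value class above (the helper
# name _check_grad_accumulation is hypothetical, not part of the
# demos). It shows why each _backward closure uses `+=` rather than
# `=`: when the same Value feeds an operation twice, both
# contributions must accumulate. For b = a + a, d(b)/d(a) = 2.
# Call it manually to verify.
# -------------------------------------------------------------------
def _check_grad_accumulation():
    a = Value(3.0, label='a')
    b = a + a  # `a` is both operands, so its gradient receives two += hits
    b.backward()
    assert a.grad == 2.0, "each branch should contribute 1.0 via +="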

# -------------------------------------------------------------------
# CONCEPTUAL VECTORIZATION HELPERS
# These mimic high-level linear algebra operations
# -------------------------------------------------------------------
def dot(v1: list[Value], v2: list[Value]) -> Value:
    """Calculates the dot product of two Value vectors."""
    assert len(v1) == len(v2), "Vectors must be of equal length for dot product."
    # Sum of element-wise multiplications: v1[0]*v2[0] + v1[1]*v2[1] + ...
    products = [a * b for a, b in zip(v1, v2)]
    # Start the sum with a Value(0.0) to keep the graph connected
    return sum(products, Value(0.0))


# -------------------------------------------------------------------
# DEMO 1: Simple Scalar Verification
# -------------------------------------------------------------------
def demo_calculation():
    print("\n--- Demo 1: Simple Scalar Calculation ---")
    x = Value(-2.0, label='x')
    y = Value(5.0, label='y')
    f = (x + y) * y
    f.backward()
    print(f"Result: {f.data}, Gradients: x={x.grad}, y={y.grad}")


# -------------------------------------------------------------------
# DEMO 2: Linear Regression Training Loop (Vectorized Structure)
# Goal: Learn the function y = 2x + 10
# -------------------------------------------------------------------
def train_linear_regression():
    print("\n--- Demo 2: Training Linear Regression (Vectorized Structure) ---")

    # 1. Data Generation: inputs X and targets Y_true
    DATA_SIZE = 50
    X = [Value(i * 0.5) for i in range(DATA_SIZE)]
    Y_true = [Value(2.0 * x.data + 10.0) for x in X]

    # Apply the BIAS TRICK: augment each input with a constant 1,
    # so each data point x_i becomes the vector [x_i, 1]
    X_biased = [[x, Value(1.0)] for x in X]

    # 2. Initialize Parameters (W, the weight vector)
    # W = [w, b], where w is the weight and b is the bias
    w = Value(random.uniform(-1, 1), label='weight')
    b = Value(random.uniform(-1, 1), label='bias')
    W = [w, b]  # The single parameter vector
    print(f"Initial Guess: y = {w.data:.2f}x + {b.data:.2f}")

    # 3. Training Loop (Gradient Descent)
    learning_rate = 1e-3  # FIX: reduced learning rate significantly for stability
    epochs = 5000

    for k in range(epochs):
        # --- Forward Pass (Vectorized Prediction) ---
        preds = [dot(x_vec, W) for x_vec in X_biased]

        # Calculate Loss: Sum of Squared Errors (SSE)
        losses = [(yp - yt)**2 for yp, yt in zip(preds, Y_true)]
        total_loss = sum(losses, Value(0.0))

        # FIX: convert SSE to Mean Squared Error (MSE) by dividing
        # the sum of losses by the number of data points
        mean_loss = total_loss * (1.0 / DATA_SIZE)

        # --- Backward Pass (AutoDiff) ---
        # Reset gradients for all parameters in W (backward() also resets
        # every node in the graph, so this is belt-and-braces)
        for p in W:
            p.grad = 0.0

        # Calculate gradients starting from the mean loss
        mean_loss.backward()

        # --- Optimization Step (Vectorized Update) ---
        for p in W:
            p.data -= learning_rate * p.grad

        if k % 50 == 0:
            # Report the mean loss, not the huge total loss
            print(f"Epoch {k:03d}: MSE Loss={mean_loss.data:.4f}, w={w.data:.3f}, b={b.data:.3f}")

    print(f"\nFinal Result (Learned): y = {w.data:.3f}x + {b.data:.3f}")
    print("Target Function: y = 2.000x + 10.000")


if __name__ == "__main__":
    # Smoke test: backward() now resets gradients itself, so a fresh,
    # unrelated graph can be differentiated before the demos run
    x_test = Value(100.0)
    y_test = Value(200.0)
    (x_test * y_test).backward()

    demo_calculation()
    train_linear_regression()
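
# -------------------------------------------------------------------
# NUMERICAL GRADIENT CHECK
# A minimal sketch, assuming the Value class above (the helper name
# finite_difference_check is hypothetical, not part of the original
# demos). It compares Demo 1's autodiff gradients against central
# finite differences, df/dx ~ (f(x+h) - f(x-h)) / (2h). For
# f = (x + y) * y at x = -2, y = 5, the analytic answers are
# df/dx = y = 5 and df/dy = x + 2y = 8.
# -------------------------------------------------------------------
def finite_difference_check(h=1e-6):
    x = Value(-2.0, label='x')
    y = Value(5.0, label='y')
    f = (x + y) * y
    f.backward()  # analytic gradients via the engine

    def f_of(xv, yv):
        return (xv + yv) * yv  # the same function on plain floats

    dx = (f_of(-2.0 + h, 5.0) - f_of(-2.0 - h, 5.0)) / (2 * h)
    dy = (f_of(-2.0, 5.0 + h) - f_of(-2.0, 5.0 - h)) / (2 * h)
    print(f"autodiff:  x.grad={x.grad:.4f}, y.grad={y.grad:.4f}")
    print(f"numerical: dx={dx:.4f}, dy={dy:.4f}")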