import math
import random
class Value:
"""
A Value stores a scalar number and its gradient.
It effectively acts as a node in our computational graph.
"""
def __init__(self, data, _children=(), _op='', label=''):
self.data = data # The actual number (e.g., -2, 5, 15)
        self.grad = 0.0               # The derivative of the final output with respect to this value
        self._backward = lambda: None # The function to propagate gradient one step back
        self._prev = set(_children)   # The child nodes (the inputs to the operation that produced this value)
self._op = _op # The operation that created this node (+, *, etc)
self.label = label # Optional label for debugging (x, y, a, f)
def __repr__(self):
return f"Value(data={self.data:.4f}, grad={self.grad:.4f})"
# -------------------------------------------------------------------
# OPERATOR OVERLOADING
# -------------------------------------------------------------------
def __add__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data + other.data, (self, other), '+')
def _backward():
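            # Addition passes the upstream gradient through unchanged; the
            # += (rather than =) accumulates correctly when the same node
            # feeds multiple operations in the graph (multivariate chain rule)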
self.grad += out.grad
other.grad += out.grad
out._backward = _backward
return out
def __mul__(self, other):
other = other if isinstance(other, Value) else Value(other)
out = Value(self.data * other.data, (self, other), '*')
def _backward():
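            # Product rule: d(a*b)/da = b and d(a*b)/db = a, each scaled
            # by the upstream gradient (chain rule)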
self.grad += other.data * out.grad
other.grad += self.data * out.grad
out._backward = _backward
return out
def __pow__(self, other):
assert isinstance(other, (int, float)), "only supporting int/float powers for now"
out = Value(self.data**other, (self,), f'**{other}')
def _backward():
# Power Rule: d/dx (x^n) = n * x^(n-1)
# Chain Rule: local_grad * upstream_grad
self.grad += (other * self.data**(other-1)) * out.grad
out._backward = _backward
return out
def __neg__(self): return self * -1
def __sub__(self, other): return self + (-other)
def __radd__(self, other): return self + other
def __rmul__(self, other): return self * other
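    # The operators above make division and reflected subtraction one-liners.
    # These two are hedged additions (not part of the original operator set),
    # derived directly from __pow__ and __neg__:
    def __truediv__(self, other): return self * other**-1  # a / b == a * b^-1
    def __rsub__(self, other): return (-self) + other      # handles e.g. 2 - Value(3)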
# -------------------------------------------------------------------
# THE BACKPROPAGATION ENGINE
# -------------------------------------------------------------------
def backward(self):
        # Build a topological ordering of the graph so that each node's
        # _backward runs only after every node that depends on it has run
topo = []
visited = set()
def build_topo(v):
if v not in visited:
visited.add(v)
for child in v._prev:
build_topo(child)
topo.append(v)
build_topo(self)
# Reset all gradients for a clean backprop run
for node in topo:
node.grad = 0.0
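        # Seed the pass: the derivative of the output with respect to itself is 1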
self.grad = 1.0
for node in reversed(topo):
node._backward()
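# A useful sanity check is to compare autodiff gradients against a central
# finite-difference estimate. This helper is a hedged sketch (its name and
# the epsilon default are choices made here, not part of the original code):
def check_grad_numerically(f, inputs, eps=1e-6):
    """Compares f's autodiff gradients at `inputs` with numerical estimates."""
    f(*inputs).backward()  # populates v.grad for every input Value
    for v in inputs:
        old = v.data
        v.data = old + eps
        f_plus = f(*inputs).data
        v.data = old - eps
        f_minus = f(*inputs).data
        v.data = old  # restore the original value
        numeric = (f_plus - f_minus) / (2 * eps)
        print(f"{v.label or 'input'}: autodiff={v.grad:.6f}, numeric={numeric:.6f}")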
# -------------------------------------------------------------------
# CONCEPTUAL VECTORIZATION HELPERS
# These mimic high-level linear algebra operations
# -------------------------------------------------------------------
def dot(v1: list[Value], v2: list[Value]) -> Value:
"""Calculates the dot product of two Value vectors."""
assert len(v1) == len(v2), "Vectors must be of equal length for dot product."
# Sum of element-wise multiplication: (v1[0]*v2[0] + v1[1]*v2[1] + ...)
products = [a * b for a, b in zip(v1, v2)]
    # Start the sum with Value(0.0) so the result is a Value even for empty
    # inputs and the accumulation stays inside the autograd graph
return sum(products, Value(0.0))
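# dot composes into higher-level linear algebra. As a minimal sketch (matvec
# is an illustrative name, not used by the demos below), a matrix-vector
# product is just one dot per row:
def matvec(M: list[list[Value]], v: list[Value]) -> list[Value]:
    """Multiplies a matrix of Values by a vector of Values, row by row."""
    return [dot(row, v) for row in M]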
# -------------------------------------------------------------------
# DEMO 1: Simple Scalar Verification
# -------------------------------------------------------------------
def demo_calculation():
print("\n--- Demo 1: Simple Scalar Calculation ---")
x = Value(-2.0, label='x')
y = Value(5.0, label='y')
f = (x + y) * y
f.backward()
print(f"Result: {f.data}, Gradients: x={x.grad}, y={y.grad}")
# -------------------------------------------------------------------
# DEMO 2: Linear Regression Training Loop (Vectorized Structure)
# Goal: Learn the function y = 2x + 10
# -------------------------------------------------------------------
def train_linear_regression():
print("\n--- Demo 2: Training Linear Regression (Vectorized Structure) ---")
    # 1. Data Generation
DATA_SIZE = 50
# Inputs (X) and Targets (Y_true)
X = [Value(i * 0.5) for i in range(DATA_SIZE)]
Y_true = [Value(2.0 * x.data + 10.0) for x in X]
# Apply the BIAS TRICK: Augment the input X with a column of 1s.
# Each data point X_i becomes a vector [x_i, 1]
X_biased = [[x, Value(1.0)] for x in X]
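    # With this augmentation, dot([x, 1], [w, b]) expands to w*x + 1*b,
    # so a single dot product computes the whole affine prediction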
# 2. Initialize Parameters (W - The Weight Vector)
# W = [w, b], where w is the weight and b is the bias
w = Value(random.uniform(-1, 1), label='weight')
b = Value(random.uniform(-1, 1), label='bias')
W = [w, b] # The single parameter vector
print(f"Initial Guess: y = {w.data:.2f}x + {b.data:.2f}")
# 3. Training Loop (Gradient Descent)
    learning_rate = 1e-3  # Small enough to keep gradient descent stable for inputs up to x = 24.5
epochs = 5000
for k in range(epochs):
# --- Forward Pass (Vectorized Prediction) ---
preds = [dot(x_vec, W) for x_vec in X_biased]
# Calculate Loss: Sum of Squared Errors (SSE)
losses = [(yp - yt)**2 for yp, yt in zip(preds, Y_true)]
total_loss = sum(losses, Value(0.0))
        # Average to get Mean Squared Error (MSE) so the gradient scale
        # does not grow with the number of data points
        mean_loss = total_loss * (1.0 / DATA_SIZE)
# --- Backward Pass (AutoDiff) ---
        # Zero the parameter gradients (technically redundant here, since
        # backward() already clears every node it visits, but explicit)
for p in W:
p.grad = 0.0
# Calculate gradients starting from the mean loss
mean_loss.backward()
# --- Optimization Step (Vectorized Update) ---
for p in W:
p.data -= learning_rate * p.grad
        if k % 500 == 0:
            print(f"Epoch {k:4d}: MSE Loss={mean_loss.data:.4f}, w={w.data:.3f}, b={b.data:.3f}")
print(f"\nFinal Result (Learned): y = {w.data:.3f}x + {b.data:.3f}")
print(f"Target Function: y = 2.000x + 10.000")
if __name__ == "__main__":
    # backward() clears gradients on every node it visits, so no manual
    # reset is needed before the demo runs
    demo_calculation()
train_linear_regression()