Machine Learning

A review of common machine learning algorithms, to consolidate the fundamentals.

Learning Algorithm

Machine learning algorithms fall into two main categories: supervised learning and unsupervised learning.


Supervised Learning

Linear Regression

Cost

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2$$

```python
def cost(Theta, X, Y):
    inner = np.power((X @ Theta.T - Y), 2)
    return np.sum(inner) / (2 * len(X))
```

Gradient Descent

```python
def gradient(Theta, X, Y):
    return (X.T @ (X @ Theta.T - Y)) / len(X)
```
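
A minimal batch gradient descent loop built on the two helpers above, assuming Theta is a (1, n) row vector and Y an (m, 1) column as in the matrix code; alpha and iters are illustrative hyperparameters, not values from the notes:

```python
def gradient_descent(Theta, X, Y, alpha=0.01, iters=1000):
    # repeatedly step opposite the gradient; track the cost per iteration
    costs = []
    for _ in range(iters):
        Theta = Theta - alpha * gradient(Theta, X, Y).T
        costs.append(cost(Theta, X, Y))
    return Theta, costs
```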

Logistic Regression

Cost

$$g(z) = \frac{1}{1 + e^{-z}}$$

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log\left(h_\theta(x^{(i)})\right) + (1 - y^{(i)})\log\left(1 - h_\theta(x^{(i)})\right)\right]$$

```python
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
```

```python
def cost(Theta, X, Y):
    part1 = Y * np.log(sigmoid(X @ Theta.T))
    part2 = (1 - Y) * np.log(1 - sigmoid(X @ Theta.T))
    return -1 / len(X) * np.sum(part1 + part2)
```

Gradient Descent

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)}$$

```python
def gradient(Theta, X, Y):
    return 1 / len(X) * X.T @ (sigmoid(X @ Theta.T) - Y)
```
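
Since both the cost and its gradient are available in closed form, an off-the-shelf optimizer can do the minimization. A sketch with scipy.optimize.minimize, assuming theta0 and Y are flat 1-D vectors; the TNC method is one reasonable choice, not prescribed by the notes:

```python
import scipy.optimize as opt

# theta0 is a flat initial parameter vector; minimize forwards the extra
# (X, Y) arguments to both cost and gradient
res = opt.minimize(fun=cost, x0=theta0, args=(X, Y),
                   method='TNC', jac=gradient)
theta_opt = res.x
```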

Feature Mapping

```python
def feature_mapping(x, y, power, as_ndarray=False):
    data = {"f{0}{1}".format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in range(0, power + 1)
            for p in range(0, i + 1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
```
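
For example, mapping two features up to the second power produces the terms 1, x, y, x², xy, y² under the column names f00, f10, f01, f20, f11, f02:

```python
import numpy as np
import pandas as pd

x = np.array([1.0, 2.0])
y = np.array([3.0, 4.0])
# columns: f00 f10 f01 f20 f11 f02  ->  1, x, y, x^2, xy, y^2
print(feature_mapping(x, y, power=2))
```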

Regularized Cost

```python
def cost(Theta, X, Y, l=1):
    part1 = Y * np.log(sigmoid(X @ Theta.T))
    part2 = (1 - Y) * np.log(1 - sigmoid(X @ Theta.T))
    reg = np.power(Theta[1:], 2)
    # note the parentheses: the penalty is l / (2 * m), not (l / 2) * m
    return -1 * np.mean(part1 + part2) + l / (2 * len(X)) * np.sum(reg)
```

Regularized Gradient

```python
def gradient(Theta, X, Y, l=1):
    part1 = 1 / len(X) * X.T @ (sigmoid(X @ Theta.T) - Y)
    reg = l / len(X) * Theta
    reg[0] = 0  # the bias term is not regularized
    return part1 + reg
```

Neural Network

Feed Forward

```python
def feed_forward(Thetas, X):
    # returns the activations A and pre-activations Z of every layer
    A = []
    Z = []
    a = X
    for t in deserialize(Thetas):
        a = np.insert(a, 0, 1, axis=1)  # prepend the bias unit
        A.append(a)
        z = a @ t.T
        Z.append(z)
        a = sigmoid(z)
    A.append(a)  # the final activation, i.e. the network output
    return A, Z
```
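
A hedged prediction helper on top of feed_forward; it assumes the labels were encoded by num2vec from the Trick section, i.e. class k lives in output unit k-1:

```python
def predict(Thetas, X):
    # the last activation holds one probability per class
    A, _ = feed_forward(Thetas, X)
    return np.argmax(A[-1], axis=1) + 1  # back to 1-based labels

# accuracy against 1-based integer labels y:
# acc = np.mean(predict(Thetas, X) == y)
```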

Cost

```python
def cost(Thetas, X, Y):
    # run the forward pass, then apply the cross-entropy loss
    for t in deserialize(Thetas):
        X = np.insert(X, 0, 1, axis=1)
        X = X @ t.T
        X = sigmoid(X)
    return np.mean(np.sum(-Y * np.log(X) - (1 - Y) * np.log(1 - X), axis=1))
```

Regularized Cost

```python
def cost(Thetas, X, Y, l=1):
    m = X.shape[0]  # capture the sample count before X is overwritten
    part2 = 0.0
    for t in deserialize(Thetas):
        X = np.insert(X, 0, 1, axis=1)
        X = X @ t.T
        X = sigmoid(X)
        t = t[..., 1:]  # drop the bias column from the penalty
        part2 += (l / (2 * m)) * np.sum(np.power(t, 2))
    part1 = np.mean(np.sum(-Y * np.log(X) - (1 - Y) * np.log(1 - X), axis=1))
    return part1 + part2
```

Back Propagation

```python
def sigmoid_gradient(z):
    return sigmoid(z) * (1 - sigmoid(z))
```

```python
def back_propagation(Thetas, X, Y, l):
    A, Z = feed_forward(Thetas, X)
    a1, a2, a3 = A
    z2, z3 = Z
    theta1, theta2 = deserialize(Thetas)
    m = X.shape[0]

    d3 = a3 - Y  # output-layer error
    d2 = d3 @ theta2[..., 1:] * sigmoid_gradient(z2)  # propagate back, skipping bias

    theta1[..., 0] = 0  # do not regularize the bias columns
    theta2[..., 0] = 0
    D2 = (1 / m) * d3.T @ a2 + (l / m) * theta2
    D1 = (1 / m) * d2.T @ a1 + (l / m) * theta1
    return serialize((D1, D2))
```
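
A sketch of wiring cost and back_propagation into an optimizer; n_params (the total number of weights) and the ±0.12 initialization scale are assumptions, the latter being the usual symmetry-breaking choice:

```python
import numpy as np
import scipy.optimize as opt

# flat random initialization to break symmetry; n_params is hypothetical:
# the total number of weights across all layers
theta0 = np.random.uniform(-0.12, 0.12, n_params)

# Y here is the one-hot matrix produced by num2vec below
res = opt.minimize(fun=cost, x0=theta0, args=(X, Y, 1),
                   jac=back_propagation, method='TNC')
Thetas_opt = res.x
```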

SVM

Gaussian Kernel

gamma is inversely related to sigma: the smaller gamma is, the larger sigma is, the "fatter" the Gaussian kernel becomes, and the more the model tends to underfit.

```python
def gaussian_kernel(X, Y, sigma):
    return np.exp(-np.sum(np.power(X - Y, 2)) / (2 * sigma ** 2))
```
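
A quick numeric check of the sigma intuition above; the two points are arbitrary:

```python
import numpy as np

x1 = np.array([1.0, 2.0, 1.0])
x2 = np.array([0.0, 4.0, -1.0])
for sigma in (0.5, 2.0, 8.0):
    print(sigma, gaussian_kernel(x1, x2, sigma))
# the kernel value grows toward 1 as sigma increases (a "fatter" kernel)
```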

Unsupervised Learning

K-means

```python
def K_means(X, K):
    '''
    @param X(ndarray): all points
    @param K(int): the num of clusters
    return (idx, centroids_all)(tuple)
        idx(ndarray): the cluster(idx) corresponding to each point
        centroids_all([ndarray, ...]): the centroids recorded at every iteration
    '''
    centroids = random_initialization(X, K)
    centroids_all = [centroids]
    idx = np.zeros((1,))
    last_c = -1
    now_c = -2
    while now_c != last_c:  # iterate until the cost stops changing
        idx = find_closet_centroids(X, centroids)
        last_c = now_c
        now_c = cost(X, idx, centroids)
        centroids = compute_centroids(X, idx)
        centroids_all.append(centroids)
    return idx, centroids_all
```
```python
def random_initialization(X, K):
    '''
    @param X(ndarray): all points
    @param K(int): the num of clusters
    return centroids of cluster(ndarray)
    '''
    res = np.zeros((1, X.shape[-1]))
    m = X.shape[0]
    rl = []
    while True:
        idx = np.random.randint(0, m)
        if idx not in rl:
            rl.append(idx)
        if len(rl) >= K:
            break
    for idx in rl:
        res = np.concatenate((res, X[idx].reshape(1, -1)), axis=0)
    return res[1:]
```
```python
def find_closet_centroids(X, centroids):
    '''
    @param X(ndarray): all points
    @param centroids(ndarray): last centroids
    return the cluster(idx) corresponding to each point
    '''
    res = np.zeros((1,))
    for x in X:
        res = np.append(res, np.argmin(np.sqrt(np.sum((centroids - x) ** 2, axis=1))))
    return res[1:]
```
```python
def compute_centroids(X, idx):
    '''
    @param X(ndarray): all points
    @param idx(ndarray): the cluster(idx) corresponding to each point
    return new centroids
    '''
    K = int(np.max(idx)) + 1
    m = X.shape[0]
    n = X.shape[-1]
    centroids = np.zeros((K, n))
    counts = np.zeros((K, n))  # counts[k] tracks how many points fell in cluster k
    for i in range(m):
        centroids[int(idx[i])] += X[i]
        counts[int(idx[i])] += 1
    centroids = centroids / counts
    return centroids
```
```python
def cost(X, idx, centroids):
    # mean squared distance between each point and its assigned centroid
    c = 0
    for i in range(len(X)):
        c += np.sum(np.power(X[i] - centroids[int(idx[i])], 2))
    c /= len(X)
    return c
```
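
A minimal usage sketch on synthetic data. Since the initialization is random, running K_means several times and keeping the lowest-cost run is the usual safeguard against bad local optima; the three-blob data below is illustrative:

```python
import numpy as np

# three illustrative 2-D blobs
X = np.concatenate([np.random.randn(50, 2) + c
                    for c in ([0, 0], [5, 5], [0, 5])])

best = None
for _ in range(10):  # restart to dodge bad initializations
    idx, centroids_all = K_means(X, K=3)
    c = cost(X, idx, centroids_all[-1])
    if best is None or c < best[0]:
        best = (c, idx, centroids_all[-1])
```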

PCA

```python
def pca(X):
    '''
    @param X(ndarray:(m, n))
    '''
    m = X.shape[0]
    sigma = X.T @ X / m  # (n,n) covariance matrix; assumes X is mean-normalized
    u, s, v = np.linalg.svd(sigma)  # u(n,n) s(n,) v(n,n)
    return u, s, v
```

```python
def project_data(X, U, K):
    '''
    project X n dimension to k dimension
    @param X(ndarray:(m,n))
    @param U(ndarray:(n,n)): u from svd(sigma)
    @param K(int): target dimension
    return (ndarray)
    '''
    return X @ U[..., :K]
```
```python
def reconstruct_data(Z, U, K):
    '''
    reconstruct Z k dimension to n dimension
    @param Z(ndarray:(m,K))
    @param U(ndarray:(n,n)): u from svd(sigma)
    @param K(int): projected dimension
    '''
    return Z @ U[..., :K].T
```
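
The singular values s returned by pca can guide the choice of K: take the smallest K that retains a target share of the variance. A sketch, where the 0.99 threshold is the conventional choice rather than one from the notes:

```python
import numpy as np

def choose_k(s, threshold=0.99):
    # smallest K whose leading singular values retain `threshold` of the variance
    ratios = np.cumsum(s) / np.sum(s)
    return int(np.argmax(ratios >= threshold)) + 1

# typical pipeline, using feature_scaling from the Trick section:
# X_scaled = feature_scaling(X)
# u, s, v = pca(X_scaled)
# K = choose_k(s)
# Z = project_data(X_scaled, u, K)
# X_rec = reconstruct_data(Z, u, K)
```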

Anomaly Detection

```python
def gaussian_distribution(X):
    # fit an independent Gaussian per feature and multiply the densities
    mu = np.mean(X, axis=0)
    sigma2 = np.var(X, axis=0)
    p = (1 / np.sqrt(2 * np.pi * sigma2)) * np.exp(-(X - mu) ** 2 / (2 * sigma2))
    return np.prod(p, axis=1)
```
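
Flagging x as anomalous when p(x) < epsilon still requires choosing epsilon. A common recipe, sketched here, is to scan candidate thresholds on a labeled validation set and keep the one with the best F1; error_analysis from the Trick section below computes F1:

```python
import numpy as np

def select_epsilon(pval, yval):
    # scan thresholds between the min and max density, keep the best F1
    best_eps, best_f1 = 0.0, 0.0
    for eps in np.linspace(pval.min(), pval.max(), 1000):
        f1 = error_analysis((pval < eps).astype(int), yval)
        if f1 > best_f1:
            best_eps, best_f1 = eps, f1
    return best_eps, best_f1
```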

Recommender System

Regularized Cost

```python
def cost(params, Y, R, nm, nu, nf, l=0.0):
    '''
    @param params(ndarray): serialized data
    @param Y(ndarray:(nm,nu)): evaluation for each movie from each user
    @param R(ndarray): flags indicating which user evaluated which movie
    @param nm(int): num of movies
    @param nu(int): num of users
    @param nf(int): num of features
    @param l(float): penalty parameter
    return loss(float)
    '''
    X, theta = deserialize(params, nm, nu, nf)  # X(nm,nf) theta(nu,nf)
    part1 = np.sum(((X @ theta.T - Y) ** 2) * R) / 2
    part2 = (l / 2) * np.sum(theta ** 2)
    part3 = (l / 2) * np.sum(X ** 2)
    return part1 + part2 + part3
```

Regularized Gradient

```python
def gradient(params, Y, R, nm, nu, nf, l=0.0):
    '''
    @param params(ndarray): serialized data
    @param Y(ndarray:(nm,nu)): evaluation for each movie from each user
    @param R(ndarray): flags indicating which user evaluated which movie
    @param nm(int): num of movies
    @param nu(int): num of users
    @param nf(int): num of features
    @param l(float): penalty parameter
    return serialized gradient
    '''
    X, theta = deserialize(params, nm, nu, nf)  # X(nm,nf) theta(nu,nf)
    g_X = ((X @ theta.T - Y) * R) @ theta + l * X
    # note the transpose: (nu,nm) @ (nm,nf) -> (nu,nf)
    g_theta = ((X @ theta.T - Y) * R).T @ X + l * theta
    return serialize(g_X, g_theta)
```
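
The recommender code calls a serialize/deserialize pair with a different signature from the one in the Trick section, and the notes do not show it. A minimal sketch, assuming params packs X first and theta second:

```python
def serialize(X, theta):
    # pack movie features and user parameters into one flat vector
    return np.concatenate((X.ravel(), theta.ravel()))

def deserialize(params, nm, nu, nf):
    # unpack: the first nm*nf entries are X, the rest are theta
    return params[:nm * nf].reshape(nm, nf), params[nm * nf:].reshape(nu, nf)
```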

Trick

feature scaling

```python
def feature_scaling(X):
    means = np.mean(X, axis=0)
    stds = np.std(X, axis=0, ddof=1)
    return (X - means) / stds
```

num2vec

```python
def num2vec(y):
    # one-hot encode 1-based labels: label k sets position k-1
    res = []
    for i in y:
        tmp = np.zeros(10)
        tmp[i - 1] = 1
        res.append(tmp)
    return np.array(res)
```
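
For example, with the 10-class digit labels this was presumably written for (in the course data, label 10 stands for the digit 0):

```python
import numpy as np

print(num2vec(np.array([1, 10])))
# [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
#  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
```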

serialize

```python
def serialize(Thetas):
    # flatten every parameter matrix and concatenate into one vector
    res = np.zeros(1)
    for t in Thetas:
        res = np.concatenate((res, t.ravel()), axis=0)
    return res[1:]
```
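
The neural-network code above also calls a deserialize that the notes leave out. A sketch under the assumption of the course exercise's 400-25-10 architecture, so theta1 is (25, 401) and theta2 is (10, 26):

```python
def deserialize(Thetas):
    # inverse of serialize for the assumed 400-25-10 architecture
    theta1 = Thetas[:25 * 401].reshape(25, 401)
    theta2 = Thetas[25 * 401:].reshape(10, 26)
    return theta1, theta2
```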

Gradient Check

```python
def gradient_check(Theta, X, Y, l, e=10 ** -4):
    '''
    @param Theta: serialized Thetas
    '''
    res = np.zeros(Theta.shape)
    for i in range(len(Theta)):
        left = np.array(Theta)
        left[i] -= e
        right = np.array(Theta)
        right[i] += e
        # central difference approximation of the i-th partial derivative
        res[i] = (cost(right, X, Y, l) - cost(left, X, Y, l)) / (2 * e)
    return res
```
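
To actually run the check, compare the numerical and analytic gradients with a relative-difference norm; values on the order of 1e-9 are the usual sign that back_propagation is correct:

```python
numeric = gradient_check(Theta, X, Y, l)
analytic = back_propagation(Theta, X, Y, l)

# relative difference; should be tiny (~1e-9) if the two agree
diff = np.linalg.norm(numeric - analytic) / np.linalg.norm(numeric + analytic)
print(diff)
```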

Error Analysis

```python
def error_analysis(yp, yt):
    '''
    compute F1-score
    @param yp(ndarray): y_pred
    @param yt(ndarray): y_true
    return F1-score(float)
    '''
    tp, fp, tn, fn = 0, 0, 0, 0
    for i in range(len(yp)):
        if yp[i] == yt[i]:
            if yp[i] == 1:
                tp += 1
            else:
                tn += 1
        else:
            if yp[i] == 1:
                fp += 1
            else:
                fn += 1
    precision = tp / (tp + fp) if tp + fp else 0
    recall = tp / (tp + fn) if tp + fn else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    return f1
```

Reference

ML-homework