「项目练习」线性回归、KNN、KMeans算法
in 学习笔记 with 0 comment

「项目练习」线性回归、KNN、KMeans算法

in 学习笔记 with 0 comment

线性回归

from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Build the toy dataset: one row per person, laid out as [height, weight].
heights = [152, 156, 160, 164, 168, 172, 176, 180, 184, 188]
weights = [51, 53, 54, 55, 57, 60, 62, 65, 69, 72]
data = np.column_stack((heights, weights))

# X is the single feature kept as an (n, 1) column; y is the 1-D target vector.
X = data[:, [0]]
y = data[:, 1]

# Separate training and test data: train_size=0.8 randomly picks 80% of the
# samples for training and leaves the rest for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# Create the linear-regression model and fit it on the training split.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

# score() returns the coefficient of determination (R^2); it is evaluated
# here on the training data itself.
regr.score(X_train, y_train)
0.9740266860381445
# Global matplotlib font settings. SimHei is a CJK typeface, presumably chosen
# so the Chinese axis labels below render instead of showing empty boxes.
font = dict(family="SimHei", size=20)
plt.rc('font', **font)

# Training samples in red.
plt.scatter(X_train, y_train, color='r')

# Fitted line: the model's predictions for the training inputs, in blue.
fitted = regr.predict(X_train)
plt.plot(X_train, fitted, color='b')

# Held-out test samples in black.
plt.scatter(X_test, y_test, color='black')

plt.xlabel('身高')
plt.ylabel('体重')
plt.show()

output_3_0.png

# Make a prediction:
# what weight does the fitted model give for a person whose height is 160?
regr.predict([[160]])
array([54.37804878])

KNN练习

# make_blobs is part of the public sklearn.datasets namespace. The
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22
# and removed in 0.24, so importing from it fails on current releases.
from sklearn.datasets import make_blobs

# Centres of the three clusters to generate.
centers = [[-2,2],[2,2],[0,4]]

# Draw 60 two-dimensional samples around those centres; x holds the points and
# y the index of the centre each point was generated from.
x,y = make_blobs(n_samples=60, centers=centers, cluster_std=0.6)
# Render the minus sign on the axes correctly.
plt.rcParams['axes.unicode_minus']=False

plt.figure(figsize=(16,10))
c = np.array(centers)

# Samples, coloured by the cluster they belong to.
plt.scatter(x[:,0],x[:,1], c=y, s=100, cmap='cool')

# Cluster centres as black triangles.
plt.scatter(c[:,0],c[:,1], s= 100, marker="^", c='black')

plt.show()

output_7_0.png

# Train a KNeighborsClassifier on the generated samples.
from sklearn.neighbors import KNeighborsClassifier
# Number of neighbours consulted for each prediction.
k = 5 
# Fit the model on the points (x) and their cluster labels (y).
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(x,y)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
# Look up the nearest neighbours of one new sample.
x_sample = [[0,2]]
# kneighbors returns a (distances, indices) pair; with no explicit count it
# uses the n_neighbors value the classifier was created with.
neighbors = clf.kneighbors(x_sample)
# Indices, into the training data, of the nearest samples.
neighbors[1]
array([[ 9, 58,  2, 48, 31]], dtype=int64)
plt.figure(figsize=(16,10), dpi=144)

# Training samples, coloured by cluster label.
plt.scatter(x[:,0],x[:,1], c=y, s=100, cmap='cool')
# Cluster centres as black triangles.
plt.scatter(c[:,0],c[:,1], s= 100, marker="^", c='black')

# The point being classified. No colormap is passed: without a `c` array
# matplotlib ignores `cmap` and emits a UserWarning, so the argument was noise.
sample_x, sample_y = x_sample[0]
plt.scatter(sample_x, sample_y, marker='x', s=200)

# Join the query point to each of its nearest training samples with a thin
# dashed line; neighbors[1][0] holds their indices into x.
for i in neighbors[1][0]:
    plt.plot([x[i][0], sample_x], [x[i][1], sample_y], 'k--', linewidth=0.6)

plt.show()

output_10_0.png

KNN练习(鸢尾花)

# Load the iris dataset bundled with scikit-learn.
from sklearn.datasets import load_iris
iris = load_iris()
# Feature matrix: one row per flower.
iris_data = iris.data
iris_data
# Feature columns: sepal length, sepal width, petal length, petal width
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.2],
       [5. , 3.2, 1.2, 0.2],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.6, 1.4, 0.1],
       [4.4, 3. , 1.3, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1. ],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4],
       [5. , 2. , 3.5, 1. ],
       [5.9, 3. , 4.2, 1.5],
       [6. , 2.2, 4. , 1. ],
       [6.1, 2.9, 4.7, 1.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.7, 3.1, 4.4, 1.4],
       [5.6, 3. , 4.5, 1.5],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.2, 4.5, 1.5],
       [5.6, 2.5, 3.9, 1.1],
       [5.9, 3.2, 4.8, 1.8],
       [6.1, 2.8, 4. , 1.3],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.6, 3. , 4.4, 1.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.7, 3. , 5. , 1.7],
       [6. , 2.9, 4.5, 1.5],
       [5.7, 2.6, 3.5, 1. ],
       [5.5, 2.4, 3.8, 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [5.8, 2.7, 3.9, 1.2],
       [6. , 2.7, 5.1, 1.6],
       [5.4, 3. , 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.3, 2.3, 4.4, 1.3],
       [5.6, 3. , 4.1, 1.3],
       [5.5, 2.5, 4. , 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.1, 3. , 4.6, 1.4],
       [5.8, 2.6, 4. , 1.2],
       [5. , 2.3, 3.3, 1. ],
       [5.6, 2.7, 4.2, 1.3],
       [5.7, 3. , 4.2, 1.2],
       [5.7, 2.9, 4.2, 1.3],
       [6.2, 2.9, 4.3, 1.3],
       [5.1, 2.5, 3. , 1.1],
       [5.7, 2.8, 4.1, 1.3],
       [6.3, 3.3, 6. , 2.5],
       [5.8, 2.7, 5.1, 1.9],
       [7.1, 3. , 5.9, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [4.9, 2.5, 4.5, 1.7],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [7.2, 3.6, 6.1, 2.5],
       [6.5, 3.2, 5.1, 2. ],
       [6.4, 2.7, 5.3, 1.9],
       [6.8, 3. , 5.5, 2.1],
       [5.7, 2.5, 5. , 2. ],
       [5.8, 2.8, 5.1, 2.4],
       [6.4, 3.2, 5.3, 2.3],
       [6.5, 3. , 5.5, 1.8],
       [7.7, 3.8, 6.7, 2.2],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.2, 5. , 1.5],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.8, 4.9, 2. ],
       [7.7, 2.8, 6.7, 2. ],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [7.2, 3.2, 6. , 1.8],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.1],
       [7.2, 3. , 5.8, 1.6],
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2. ],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 3. , 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])
# Class label (target) of every sample.
iris_target = iris.target
# Split into training and test sets, holding out 25% of the samples for testing.
x_train,x_test,y_train,y_test = train_test_split(iris_data,iris_target,test_size=0.25)
# Train a 5-nearest-neighbours classifier and predict the held-out samples.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
y_predict = knn.predict(x_test)
# Report each test sample.
# Display names indexed by class id, in the order of iris.target_names:
# 0 = setosa, 1 = versicolor, 2 = virginica.
labels=['山鸢尾','变色鸢尾','维吉尼亚鸢尾']
# y_test is the ground truth and y_predict the model output; they must fill the
# "true value" and "predicted value" slots of the message in that order.
for i, (actual, predicted) in enumerate(zip(y_test, y_predict), start=1):
    print("第%d次测试:真实值是%s,预测值是%s" % (i, labels[actual], labels[predicted]))
第1次测试:真实值是山鸢尾,预测值是山鸢尾
第2次测试:真实值是山鸢尾,预测值是山鸢尾
第3次测试:真实值是山鸢尾,预测值是山鸢尾
第4次测试:真实值是变色鸢尾,预测值是变色鸢尾
第5次测试:真实值是变色鸢尾,预测值是变色鸢尾
第6次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第7次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第8次测试:真实值是山鸢尾,预测值是山鸢尾
第9次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第10次测试:真实值是山鸢尾,预测值是山鸢尾
第11次测试:真实值是变色鸢尾,预测值是变色鸢尾
第12次测试:真实值是变色鸢尾,预测值是变色鸢尾
第13次测试:真实值是山鸢尾,预测值是山鸢尾
第14次测试:真实值是山鸢尾,预测值是山鸢尾
第15次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第16次测试:真实值是变色鸢尾,预测值是变色鸢尾
第17次测试:真实值是变色鸢尾,预测值是变色鸢尾
第18次测试:真实值是山鸢尾,预测值是山鸢尾
第19次测试:真实值是变色鸢尾,预测值是变色鸢尾
第20次测试:真实值是变色鸢尾,预测值是变色鸢尾
第21次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第22次测试:真实值是变色鸢尾,预测值是变色鸢尾
第23次测试:真实值是山鸢尾,预测值是山鸢尾
第24次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第25次测试:真实值是变色鸢尾,预测值是变色鸢尾
第26次测试:真实值是变色鸢尾,预测值是变色鸢尾
第27次测试:真实值是山鸢尾,预测值是山鸢尾
第28次测试:真实值是虹膜锦葵,预测值是变色鸢尾
第29次测试:真实值是山鸢尾,预测值是山鸢尾
第30次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第31次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第32次测试:真实值是变色鸢尾,预测值是变色鸢尾
第33次测试:真实值是山鸢尾,预测值是山鸢尾
第34次测试:真实值是虹膜锦葵,预测值是虹膜锦葵
第35次测试:真实值是变色鸢尾,预测值是变色鸢尾
第36次测试:真实值是变色鸢尾,预测值是变色鸢尾
第37次测试:真实值是山鸢尾,预测值是山鸢尾
第38次测试:真实值是变色鸢尾,预测值是变色鸢尾
# Mean accuracy of the classifier on the held-out test split.
knn.score(x_test,y_test)
0.9736842105263158
## Find the best value of K
from sklearn.model_selection import cross_val_score

x = iris.data
y = iris.target
k_range = range(1,30)

# For every candidate k, the error is 1 minus the mean accuracy over the
# cross-validation folds; cv=6 makes cross_val_score do the train/test
# splitting itself.
k_error = [
    1 - cross_val_score(KNeighborsClassifier(n_neighbors=k), x, y, cv=6).mean()
    for k in k_range
]

plt.plot(k_range, k_error)
plt.xlabel('k的值')
plt.ylabel('错误')
plt.show()

output_18_0.png

KMeans 聚类分析

from sklearn.cluster import KMeans

# Load the customer data from data.csv in the working directory and preview
# the first rows.
df = pd.read_csv("data.csv")
df.head()
CustomerID Gender Age Annual Income (k$) Spending Score (1-100)
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40
# Column names, non-null counts and dtypes of the frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
# Number of missing values in each column.
df.isnull().sum()
CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64
# Summary statistics of the numeric columns.
df.describe()
CustomerID Age Annual Income (k$) Spending Score (1-100)
count 200.000000 200.000000 200.000000 200.000000
mean 100.500000 38.850000 60.560000 50.200000
std 57.879185 13.969007 26.264721 25.823522
min 1.000000 18.000000 15.000000 1.000000
25% 50.750000 28.750000 41.500000 34.750000
50% 100.500000 36.000000 61.500000 50.000000
75% 150.250000 49.000000 78.000000 73.000000
max 200.000000 70.000000 137.000000 99.000000
# Rename the columns to Chinese. The assignment is positional, so the list must
# follow the existing column order: CustomerID, Gender, Age, Annual Income,
# Spending Score.
df.columns=['用户ID',"性别","年龄","年收入","支出"]
df.head()
用户ID 性别 年龄 年收入 支出
0 1 Male 19 15 39
1 2 Male 21 15 81
2 3 Female 20 16 6
3 4 Female 23 16 77
4 5 Female 31 17 40

查看数据分布

fig = plt.figure(figsize=(20, 5))
fig.suptitle('各指标数据分布')

# Head-count per gender, in the same order as the bar labels used below.
gender_counts = [(df['性别'] == g).sum() for g in ('Male', 'Female')]

# The 2x2 grid described as (subplot position, panel title, drawing callback):
# age histogram, gender bar chart, annual-income histogram, spending histogram.
panels = [
    (221, '年龄分布', lambda axes: axes.hist(df['年龄'])),
    (222, "性别", lambda axes: axes.bar(['男', '女'], height=gender_counts)),
    (223, "年收入", lambda axes: axes.hist(df['年收入'])),
    (224, "支出", lambda axes: axes.hist(df['支出'])),
]
for position, heading, draw in panels:
    ax = fig.add_subplot(position)
    draw(ax)
    ax.title.set_text(heading)

# Spacing between the four panels.
fig.subplots_adjust(wspace=0.3, hspace=0.5)

output_26_0.png

年龄与年收入之间的关系

plt.figure(figsize=(12, 6))
# One scatter series per gender so each gets its own colour and legend entry.
for gender in ['Male', 'Female']:
    subset = df[df['性别'] == gender]
    plt.scatter(subset['年龄'], subset['年收入'], s=200, alpha=0.5, label=gender)

plt.xlabel('年龄')
plt.ylabel('年收入')
plt.legend()
plt.show()

output_28_0.png

查看各个年龄段的支出情况(年龄和支出之间的关系)

plt.figure(figsize=(12, 6))
# One scatter series per gender so each gets its own colour and legend entry.
for gender in ['Male', 'Female']:
    subset = df[df['性别'] == gender]
    plt.scatter(subset['年龄'], subset['支出'], s=200, alpha=0.5, label=gender)

plt.xlabel('年龄')
plt.ylabel('支出')
plt.legend()
plt.show()

output_30_0.png

收入与支出的关系

plt.figure(figsize=(12, 6))
# One scatter series per gender so each gets its own colour and legend entry.
for gender in ['Male', 'Female']:
    subset = df[df['性别'] == gender]
    plt.scatter(subset['年收入'], subset['支出'], s=200, alpha=0.5, label=gender)

plt.xlabel('年收入')
plt.ylabel('支出')
plt.legend()
plt.show()

output_32_0.png

使用KMeans聚类分析

## Find the best value of K
x1 = df[['年龄', '支出']].values

# Fit K-Means for 1 to 10 clusters and record inertia_ for each run, i.e. the
# within-cluster sum of squared errors.
cluster_counts = range(1, 11)
inertia = [KMeans(n_clusters=n).fit(x1).inertia_ for n in cluster_counts]

plt.figure(figsize=(12, 6))
plt.plot(cluster_counts, inertia)

plt.title("寻找最佳K值")
plt.xlabel('簇的数量')
plt.ylabel('簇内误差平方和')
plt.show()

output_34_0.png

# Cluster the (age, spending) points into 4 groups; fit_predict fits the model
# and returns the cluster index assigned to every row of x1.
km = KMeans(n_clusters=4)
y_means = km.fit_predict(x1)
y_means
array([2, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 2, 1,
       0, 1, 0, 1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 3, 1, 3, 2,
       0, 2, 3, 2, 2, 2, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2,
       3, 3, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, 2,
       2, 3, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 2, 3, 3, 2, 3, 2, 3, 3, 3, 3,
       3, 2, 2, 2, 2, 2, 3, 3, 3, 3, 2, 2, 2, 1, 2, 1, 3, 1, 0, 1, 0, 1,
       2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 3, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 3, 1, 0, 1, 0, 1, 0, 1, 0, 2, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1])
plt.figure(figsize=(12, 6))

# One scatter call per cluster id, so each cluster takes the next colour of the
# default colour cycle.
for cluster_id in range(4):
    members = x1[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=200)

# Cluster centres in black, labelled for the legend.
cluster_centres = km.cluster_centers_
plt.scatter(cluster_centres[:, 0], cluster_centres[:, 1], s=100, c='black', label='中心点')

plt.xlabel('年龄')
plt.ylabel('支出')
plt.legend()
plt.show()

output_36_0.png

根据年收入和支出进行聚类分析

## Find the best value of K
x1 = df[['年收入', '支出']].values

# Fit K-Means for 1 to 10 clusters and record inertia_ for each run, i.e. the
# within-cluster sum of squared errors.
cluster_counts = range(1, 11)
inertia = [KMeans(n_clusters=n).fit(x1).inertia_ for n in cluster_counts]

plt.figure(figsize=(12, 6))
plt.plot(cluster_counts, inertia)

plt.title("寻找最佳K值")
plt.xlabel('簇的数量')
plt.ylabel('簇内误差平方和')
plt.show()

output_38_0.png

# Cluster the (annual income, spending) points into 5 groups.
km = KMeans(n_clusters=5)
y_means = km.fit_predict(x1)

plt.figure(figsize=(12, 6))

# One scatter call per cluster id, so each cluster takes the next colour of the
# default colour cycle.
for cluster_id in range(5):
    members = x1[y_means == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=200)

# Cluster centres in black, labelled for the legend.
cluster_centres = km.cluster_centers_
plt.scatter(cluster_centres[:, 0], cluster_centres[:, 1], s=100, c='black', label='中心点')

plt.xlabel('年收入')
plt.ylabel('支出')
plt.legend()
plt.show()

output_39_0.png

尝试3个特征聚类分析

# Feature matrix with three columns: age, annual income, spending.
x = df[['年龄', '年收入', '支出']].values

# Fit K-Means for 1 to 10 clusters and record inertia_ for each run, i.e. the
# within-cluster sum of squared errors.
cluster_counts = range(1, 11)
inertia = [KMeans(n_clusters=n).fit(x).inertia_ for n in cluster_counts]

plt.figure(1, figsize=(12, 6))
plt.plot(cluster_counts, inertia)
plt.title('寻找最佳K值', fontsize=20)
plt.xlabel('簇的数量')
plt.ylabel('簇内误差平方和')
plt.show()

output_41_0.png

import plotly as py
import plotly.graph_objects as go

# Fit K-Means with 6 clusters on the three-feature matrix x. random_state pins
# the seed so the assignment is repeatable between runs.
algorithm = KMeans(
    n_clusters=6,
    max_iter=300,
    tol=0.0001,
    random_state=100,
    algorithm='elkan',
)
algorithm.fit(x)
labels = algorithm.labels_
centroids = algorithm.cluster_centers_

# Store the cluster index on the frame so it can drive the marker colours.
df['label'] = labels

# Marker fill and outline are both coloured by cluster index.
marker_style = dict(
    color=df['label'],
    size=15,
    line=dict(color=df['label'], width=10),
    opacity=0.8,
)
trace1 = go.Scatter3d(
    x=df['年龄'],
    y=df['年收入'],
    z=df['支出'],
    mode='markers',
    marker=marker_style,
)
data = [trace1]

# 800x800 figure whose three axes are titled after the plotted columns.
scene_axes = dict(
    xaxis=dict(title='年龄'),
    yaxis=dict(title='年收入'),
    zaxis=dict(title='支出'),
)
layout = go.Layout(height=800, width=800, title='聚类分析', scene=scene_axes)

fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)

newplot.png