作者热门文章
- c - 在位数组中找到第一个零
- linux - Unix 显示有关匹配两种模式之一的文件的信息
- 正则表达式替换多个文件
- linux - 隐藏来自 xtrace 的命令
我目前正在高维情况下用scikit试验Lasso。标签是 Y_i(实数),特征是 X_i(X_i 是大小为 d=112 的向量)。我只有三对 (Y_i,X_i)。
d>>n=3 所以我们处于高维情况。
import numpy as np
Y = np.array([ 0.24186978, 0.20693342, 0.00441244])
X0 = np.array([ 0.49019359, -0.11332346, 0.46826879, -0.13540658, 0.37022392, -0.23379722, 0.37143564, -0.2329437 , 0.37291492, -0.23186138, 0.37469679, -0.23055168, 0.30316716, -0.29125359, 0.30840626, -0.28652415, 0.44230139, -0.16121566, 0.42683712, -0.17683825, 0.32256713, -0.28145402, 0.3280964 , -0.27628293, 0.33245644, -0.27231986, 0.33670266, -0.26854582, 0.2643481 , -0.33007265, 0.27145917, -0.32347124, 0.3864629 , -0.21705415, 0.3808803 , -0.22279507, 0.27458751, -0.32943364, 0.28447461, -0.31990473, 0.2917428 , -0.3130335 , 0.29848329, -0.30676519, 0.22697144, -0.36744932, 0.2357466 , -0.35918381, 0.32553467, -0.27798238, 0.33200664, -0.27166872, 0.22802673, -0.37599441, 0.24186978, -0.36250956, 0.25182545, -0.35295084, 0.26090483, -0.34434365, 0.19180827, -0.40261249, 0.20193396, -0.39299645, 0.26323078, -0.34028627, 0.28211954, -0.32155583, 0.18444715, -0.419574 , 0.20146085, -0.40291849, 0.21366417, -0.39111212, 0.2247606 , -0.38048788, 0.15946525, -0.43495551, 0.17055441, -0.424376 , 0.20348854, -0.40002851, 0.23321321, -0.37046216, 0.14509726, -0.45892388, 0.16422526, -0.44015407, 0.17807138, -0.42670492, 0.1907319 , -0.41451658, 0.13036714, -0.46405362, 0.14199556, -0.45293485, 0.14977732, -0.45373973, 0.18715638, -0.41651899, 0.11082473, -0.49319641, 0.13088375, -0.47349559, 0.145673 , -0.45910329, 0.15936004, -0.44588844, 0.10475443, -0.48966633, 0.11649699, -0.47843342])
X1 = np.array([ 0.08172583, 0.08172583, 0.12787895, 0.12787895, 0.17680895, 0.17680895, 0.20428698, 0.20428698, 0.22810783, 0.22810783, 0.24952302, 0.24952302, 0.25443032, 0.25443032, 0.27212382, 0.27212382, 0.09939284, 0.09939284, 0.14649492, 0.14649492, 0.18353275, 0.18353275, 0.21186616, 0.21186616, 0.23646753, 0.23646753, 0.25859485, 0.25859485, 0.25241207, 0.25241207, 0.27111512, 0.27111512, 0.11277054, 0.11277054, 0.16042754, 0.16042754, 0.18318121, 0.18318121, 0.21269144, 0.21269144, 0.23825706, 0.23825706, 0.26132525, 0.26132525, 0.24416304, 0.24416304, 0.26402983, 0.26402983, 0.11961642, 0.11961642, 0.16822144, 0.16822144, 0.17599107, 0.17599107, 0.20693342, 0.20693342, 0.23361131, 0.23361131, 0.25782472, 0.25782472, 0.23053159, 0.23053159, 0.2516101 , 0.2516101 , 0.11876227, 0.11876227, 0.16908658, 0.16908658, 0.16286772, 0.16286772, 0.19528754, 0.19528754, 0.22310772, 0.22310772, 0.24857796, 0.24857796, 0.21262181, 0.21262181, 0.23482641, 0.23482641, 0.11042389, 0.11042389, 0.16301827, 0.16301827, 0.14522374, 0.14522374, 0.17886349, 0.17886349, 0.20768069, 0.20768069, 0.23437567, 0.23437567, 0.19167763, 0.19167763, 0.21478313, 0.21478313, 0.09612585, 0.09612585, 0.15078275, 0.15078275, 0.1247584 , 0.1247584 , 0.15903691, 0.15903691, 0.18850909, 0.18850909, 0.21622738, 0.21622738, 0.16897004, 0.16897004, 0.1926264 , 0.1926264 ])
X2 = np.array([ 0.0039031 , 0.0039031 , 0.00346908, 0.00346908, 0.00450824, 0.00450824, 0.00409751, 0.00409751, 0.0038224 , 0.0038224 , 0.00358683, 0.00358683, 0.00393648, 0.00393648, 0.00374151, 0.00374151, 0.00488007, 0.00488007, 0.0040774 , 0.0040774 , 0.00478876, 0.00478876, 0.00434275, 0.00434275, 0.0040458 , 0.0040458 , 0.00379218, 0.00379218, 0.00397968, 0.00397968, 0.00379608, 0.00379608, 0.00568263, 0.00568263, 0.00457514, 0.00457514, 0.00488406, 0.00488406, 0.00444946, 0.00444946, 0.00415691, 0.00415691, 0.00390482, 0.00390482, 0.00391778, 0.00391778, 0.00375997, 0.00375997, 0.00617576, 0.00617576, 0.00490909, 0.00490909, 0.00478816, 0.00478816, 0.00441244, 0.00441244, 0.00415124, 0.00415124, 0.00392093, 0.00392093, 0.00375961, 0.00375961, 0.00363975, 0.00363975, 0.00627155, 0.00627155, 0.00504258, 0.00504258, 0.00451513, 0.00451513, 0.00423891, 0.00423891, 0.00403303, 0.00403303, 0.00384307, 0.00384307, 0.0035197 , 0.0035197 , 0.00344643, 0.00344643, 0.00595365, 0.00595365, 0.00496165, 0.00496165, 0.00409633, 0.00409633, 0.003947 , 0.003947 , 0.00381432, 0.00381432, 0.00367948, 0.00367948, 0.00321652, 0.00321652, 0.00319428, 0.00319428, 0.0052817 , 0.0052817 , 0.00467728, 0.00467728, 0.00357511, 0.00357511, 0.00356312, 0.00356312, 0.00351338, 0.00351338, 0.0034431 , 0.0034431 , 0.00287055, 0.00287055, 0.00289938, 0.00289938])
X = np.array([X0,X1,X2])
这些数据使得方程 Y = X·theta 存在一个解,其中 theta 是一个 d 维向量:除索引 54 处的分量为 1 外,其余分量全为 0:
>>> Y
array([ 0.24186978, 0.20693342, 0.00441244])
>>> X[0, 54]
0.24186978045754323
>>> X[1, 54]
0.20693341629897405
>>> X[2, 54]
0.0044124449820170455
但是,当我应用 Lasso 时,它不是预期的结果......:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
res = lasso.fit(X,Y)
得到:
>>> res.coef_.tolist()
[0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0]
通过改变惩罚系数:
lasso = Lasso(alpha=0.01)
res = lasso.fit(X,Y)
结果还是错误:
>>> res.coef_.tolist()
[0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.24488850166974235, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0]
如何才能得到预期的系数向量?
最佳答案
Lasso 求解的不是 l0 惩罚最小二乘问题,而是 l1 惩罚最小二乘问题。您在 alpha=0.01 时得到的解正是 Lasso 解(只有特征 #10 有一个约为 0.245 的非零系数)。
即使您期望的解的平方重建误差为 0.0,它的 l1 惩罚仍然是 1.0(还要乘以 alpha)。
alpha=0.01 时的 Lasso 解有一个较小的平方重建误差 0.04387(还要除以 2 * n_samples == 6),以及更小的 l1 惩罚 0.245(还要乘以 alpha)。因此它的目标函数值约为 0.04387 / 6 + 0.01 × 0.245 ≈ 0.0098,低于您期望的解的 0.01 × 1.0 = 0.01。
文档字符串中给出了 Lasso 最小化的目标函数:
$$\frac{1}{2\,n_{\text{samples}}}\,\lVert y - Xw\rVert_2^2 + \alpha\,\lVert w\rVert_1$$
总结一下正则化最小二乘回归中常用的几种先验(或惩罚):
- l2 惩罚倾向于任意数量的非零系数,但这些系数的绝对值都非常小(接近于零)。
- l1 惩罚倾向于少量的非零系数,且这些系数的绝对值较小。
- l0 惩罚倾向于少量的非零系数,这些系数的绝对值可以是任意大小。
l0 是非凸的,通常不如 l1 和 l2 容易优化。这就是为什么人们在实践中使用 l1(lasso)或 l1 + l2(elastic net)来寻找稀疏解,即使得到的解不如 l0 的解理想。
关于python - Lasso 回归,所有系数均为 0,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/25989720/
我是一名优秀的程序员,十分优秀!