
C - Logistic regression code stops working above ~43,500 generated observations


I'm having some difficulty troubleshooting code I wrote in C to perform a logistic regression. While it seems to work on smaller, semi-randomized datasets, it stops working (e.g., assigning proper probabilities of belonging to class 1) at around 43,500 observations (determined by tweaking the number of observations created). Of the 150 features used in the code, I do create the first two as a function of the number of observations, so I'm not sure whether that is the issue here, although I am using double. Maybe there is an overflow somewhere in the code?

The code below should be self-contained; it generates m = 50,000 observations with n = 150 features. Setting m below 43,500 should return "Percent class 1: 0.250000", while setting it to 44,000 or above will return "Percent class 1: 0.000000", regardless of what max_iter (the number of times we sample m observations) is set to.

The first feature is set to 1.0 divided by the total number of observations if the observation belongs to class 0 (the first 75% of observations), and otherwise to the observation's index divided by the total number of observations.

The second feature is simply the index divided by the total number of observations.

All other features are random.

The logistic regression is intended to use stochastic gradient descent: randomly select an observation index, compute the gradient of the loss with respect to the predicted y using the current weights, and update the weights with that gradient and the learning rate (eta).
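
In symbols (my notation, not part of the original post), the per-sample update this describes, for the log-loss, is

    \hat{y}_i = \sigma(w^\top x_i + b), \qquad
    w \leftarrow w - \eta\,(\hat{y}_i - y_i)\,x_i, \qquad
    b \leftarrow b - \eta\,(\hat{y}_i - y_i)

so the resid = -(y_i - y_pred) computed in the C code below is exactly \hat{y}_i - y_i.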

Using the same initialization with Python and NumPy, I still get the correct results, even above 50,000 observations.

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>

// Compute z = w * x + b
double dlc( int n, double *X, double *coef, double intercept )
{
    double y_pred = intercept;
    for (int i = 0; i < n; i++)
    {
        y_pred += X[i] * coef[i];
    }
    return y_pred;
}

// Compute y_hat = 1 / (1 + e^(-z))
double sigmoid( int n, double alpha, double *X, double *coef, double beta, double intercept )
{
    double y_pred;
    y_pred = dlc(n, X, coef, intercept);
    y_pred = 1.0 / (1.0 + exp(-y_pred));

    return y_pred;
}

// Stochastic gradient descent
void sgd( int m, int n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept, int random_seed )
{
    double *gradient_coef, *X_i;
    double y_i, y_pred, resid;
    int idx;

    double gradient_intercept = 0.0, alpha = 1.0, beta = 1.0;

    X_i = (double *) malloc (n * sizeof(double));
    gradient_coef = (double *) malloc (n * sizeof(double));

    for ( int i = 0; i < n; i++ )
    {
        coef[i] = 0.0;
        gradient_coef[i] = 0.0;
    }
    *intercept = 0.0;

    srand(random_seed);

    for ( int epoch = 0; epoch < max_iter; epoch++ )
    {
        for ( int run = 0; run < m; run++ )
        {
            // Randomly sample an observation
            idx = rand() % m;
            for ( int i = 0; i < n; i++ )
            {
                X_i[i] = X[n*idx+i];
            }
            y_i = y[idx];
            // Compute y_hat
            y_pred = sigmoid( n, alpha, X_i, coef, beta, *intercept );
            resid = -(y_i - y_pred);
            // Compute gradients and adjust weights
            for (int i = 0; i < n; i++)
            {
                gradient_coef[i] = X_i[i] * resid;
                coef[i] -= eta * gradient_coef[i];
            }
            if ( fit_intercept == 1 )
            {
                *intercept -= eta * resid;
            }
        }
    }
}

int main(void)
{
    double *X, *y, *coef, *y_pred;
    double intercept;
    double eta = 0.05;
    double alpha = 1.0, beta = 1.0;
    long m = 50000;
    long n = 150;
    int max_iter = 20;

    long class_0 = (long)(3.0 / 4.0 * (double)m);
    double pct_class_1 = 0.0;

    clock_t test_start;
    clock_t test_end;
    double test_time;

    printf("Constructing variables...\n");
    X = (double *) malloc (m * n * sizeof(double));
    y = (double *) malloc (m * sizeof(double));
    y_pred = (double *) malloc (m * sizeof(double));
    coef = (double *) malloc (n * sizeof(double));

    // Initialize classes
    for (int i = 0; i < m; i++)
    {
        if (i < class_0)
        {
            y[i] = 0.0;
        }
        else
        {
            y[i] = 1.0;
        }
    }

    // Initialize observation features
    for (int i = 0; i < m; i++)
    {
        if (i < class_0)
        {
            X[n*i] = 1.0 / (double)m;
        }
        else
        {
            X[n*i] = (double)i / (double)m;
        }
        X[n*i + 1] = (double)i / (double)m;
        for (int j = 2; j < n; j++)
        {
            X[n*i + j] = (double)(rand() % 100) / 100.0;
        }
    }

    // Fit weights
    printf("Running SGD...\n");
    test_start = clock();
    sgd( m, n, X, y, coef, &intercept, eta, max_iter, 1, 42 );
    test_end = clock();
    test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC;
    printf("Time taken: %f\n", test_time);

    // Compute y_hat and share of observations predicted as class 1
    printf("Making predictions...\n");
    for ( int i = 0; i < m; i++ )
    {
        y_pred[i] = sigmoid( n, alpha, &X[i*n], coef, beta, intercept );
    }

    printf("Printing results...\n");
    for ( int i = 0; i < m; i++ )
    {
        //printf("%f\n", y_pred[i]);
        if (y_pred[i] > 0.5)
        {
            pct_class_1 += 1.0;
        }
        // Troubleshooting print
        if (i < 10 || i > m - 10)
        {
            printf("%g\n", y_pred[i]);
        }
    }
    printf("Percent class 1: %f", pct_class_1 / (double)m);

    return 0;
}

For reference, here is my (presumably) equivalent Python code, which returns the correct percentage of identified classes even above 50,000 observations:

import numpy as np
import time

def sigmoid(x):
    return 1 / (1 + np.exp(-x))


class LogisticRegressor:
    def __init__(self, eta, init_runs, fit_intercept=True):
        self.eta = eta
        self.init_runs = init_runs
        self.fit_intercept = fit_intercept

    def fit(self, x, y):
        m, n = x.shape
        self.coef = np.zeros((n, 1))
        self.intercept = np.zeros((1, 1))

        for epoch in range(self.init_runs):
            for run in range(m):
                idx = np.random.randint(0, m)
                x_i = x[idx:idx+1, :]
                y_i = y[idx]
                y_pred_i = sigmoid(x_i.dot(self.coef) + self.intercept)
                gradient_w = -(x_i.T * (y_i - y_pred_i))
                self.coef -= self.eta * gradient_w
                if self.fit_intercept:
                    gradient_b = -(y_i - y_pred_i)
                    self.intercept -= self.eta * gradient_b

    def predict_proba(self, x):
        m, n = x.shape
        y_pred = np.ones((m, 2))
        y_pred[:,1:2] = sigmoid(x.dot(self.coef) + self.intercept)
        y_pred[:,0:1] -= y_pred[:,1:2]
        return y_pred

    def predict(self, x):
        return np.round(sigmoid(x.dot(self.coef) + self.intercept))


m = 50000
n = 150
class1 = int(3.0 / 4.0 * m)

X = np.random.rand(m, n)
y = np.zeros((m, 1))

for obs in range(m):
    if obs < class1:
        continue
    else:
        y[obs,0] = 1

for obs in range(m):
    if obs < class1:
        X[obs, 0] = 1.0 / float(m)
    else:
        X[obs, 0] = float(obs) / float(m)
    X[obs, 1] = float(obs) / float(m)

logit = LogisticRegressor(0.05, 20)
start_time = time.time()
logit.fit(X, y)
end_time = time.time()
print(round(end_time - start_time, 2))
y_pred = logit.predict(X)
print("Percent:", y_pred.sum() / len(y_pred))

Best Answer

The problem is here:

            // Randomly sample an observation
            idx = rand() % m;

... in light of the fact that the OP's RAND_MAX is 32767. This is exacerbated by the fact that all of the class-1 observations are at the end.

All samples will be drawn from the first 32768 observations, and when the total number of observations is greater than that, the proportion of class-1 observations among those that can be sampled is less than 0.25. At 43691 total observations, there are no class-1 observations among those that can be sampled at all.
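
As a minimal sketch of that arithmetic (mine, not the answerer's), the following standalone program counts how many class-1 indices rand() % m can ever reach, assuming RAND_MAX is 32767 as it is on the OP's platform; on implementations with a larger RAND_MAX there is no shortfall and the two counts simply match.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    printf("RAND_MAX = %d\n", RAND_MAX);   // 32767 on the OP's platform

    for (long m = 40000; m <= 50000; m += 2000)
    {
        long class_0 = (long)(3.0 / 4.0 * (double)m);   // class 1 starts at this index
        // rand() only produces values 0 .. RAND_MAX, so rand() % m can only
        // ever yield indices 0 .. min(m - 1, RAND_MAX)
        long reachable = (RAND_MAX >= m - 1) ? m : (long)RAND_MAX + 1;
        long reachable_class_1 = reachable > class_0 ? reachable - class_0 : 0;
        printf("m = %ld: %ld of %ld class-1 observations can be sampled\n",
               m, reachable_class_1, m - class_0);
    }
    return 0;
}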

As a secondary matter, rand() % m does not produce a perfectly uniform distribution when m does not evenly divide RAND_MAX + 1 (for example, with RAND_MAX + 1 = 32768 and m = 20000, indices 0 through 12767 would be drawn twice as often as indices 12768 through 19999), though the effects of that issue will be much more subtle.


Bottom line: you need a better random number generator.

At minimum, you could consider combining the bits from two calls to rand() to produce an integer with sufficient range, but you might want to consider getting a third-party generator. Several are available.
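
As an illustration of the first suggestion only, here is a minimal sketch (mine, not the answerer's) of a helper that assembles a 30-bit value from two rand() calls and uses rejection to avoid the modulo bias mentioned above. It relies only on the standard guarantee that RAND_MAX is at least 32767.

#include <stdlib.h>

// Draw an index uniformly from [0, m) using 30 bits built from two rand() calls.
// Each call contributes 15 bits, which is safe because the C standard guarantees
// RAND_MAX >= 32767.
static long rand_index(long m)
{
    const long range = 1L << 30;            // 2^30 possible combined values
    long r;
    do {
        long hi = rand() & 0x7FFF;          // 15 bits
        long lo = rand() & 0x7FFF;          // 15 more bits
        r = (hi << 15) | lo;
    } while (r >= range - (range % m));     // drop the uneven tail so r % m is unbiased
    return r % m;
}

In sgd(), the sampling line would then become idx = (int) rand_index(m);, so that every one of the m observations can actually be selected.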

Regarding "C - logistic regression code stops working above ~43,500 generated observations", see the corresponding question on Stack Overflow: https://stackoverflow.com/questions/75299046/
