// Multiple linear regression // Replicates Excel // Martin Sewell // martin@martinsewell.com // 27 July 2020 // Install GSL, the GNU Scientific Library, by following the instructions given in the link below. // https://solarianprogrammer.com/2020/01/26/getting-started-gsl-gnu-scientific-library-windows-macos-linux/ #include #include #include #include #include #include #include #include #include int main() { std::string inputfilename; inputfilename = "input.txt"; // Can be space, tab or comma separated. No headers. First column must be y, the dependent variable. Other columns are x1, x2, x3, etc. std::ifstream inputfile; inputfile.open(inputfilename); if (inputfile.fail()) { std::cout << "Failed to open " << inputfilename << std::endl; std::cout << "Press Enter"; std::cin.get(); return 0; } std::vector> datarc; std::string str; std::istringstream data_s; double d; long int row = 0; int col = 0; while (!inputfile.eof()) { while (getline(inputfile, str)) { if (!str.empty()) { datarc.push_back(std::vector()); std::replace(str.begin(), str.end(), ',', ' '); std::istringstream data_s(str); while (data_s >> d) { datarc[row].push_back(d); if (row == 0) col++; } row++; } } } long int numrows = row; int numcols = col; std::vector< std::vector > datacr(numcols, std::vector(numrows)); for (int i = 0; i < numcols; ++i) for (int j = 0; j < numrows; ++j) datacr[i][j] = datarc[j][i]; long int N = numrows; // number of data points int P = numcols; // number of independent variables = P-1 gsl_vector* y; gsl_matrix* X; gsl_vector* c; // coefficients gsl_matrix* cov; // allocate space for the matrices and vectors X = gsl_matrix_alloc(N, P); // input y = gsl_vector_alloc(N); // input c = gsl_vector_alloc(P); // output cov = gsl_matrix_alloc(P, P); // output // put the data into the X matrix, row by row for (int r = 0; r < N; ++r) { gsl_matrix_set(X, r, 0, 1.0); for (int c = 1; c < P; ++c) gsl_matrix_set(X, r, c, datarc[r][c]); } // fill vector of observed data for (int r = 0; r < N; ++r) { gsl_vector_set(y, r, datacr[0][r]); } // allocate temporary work space for gsl gsl_multifit_linear_workspace* work; work = gsl_multifit_linear_alloc(N, P); // fit model double chisq; gsl_multifit_linear(X, y, c, cov, &chisq, work); std::vector coefficients(numcols); std::vector standarderror(numcols); std::vector tstat(numcols); std::vector pvalue(numcols); std::vector Lower95(numcols); std::vector Upper95(numcols); gsl_vector_view stderror = gsl_matrix_diagonal(cov); gsl_vector* z = &stderror.vector; int Regressiondf = P - 1; long int Residualdf = N - P; long int Totaldf = N - 1; double ResidualSS = chisq; double TotalSS = gsl_stats_tss(y->data, y->stride, y->size); double RegressionSS = TotalSS - ResidualSS; double ResidualMS = chisq / Residualdf; double RegressionMS = RegressionSS / Regressiondf; for (int j = 0; j < numcols; j++) { coefficients[j] = gsl_vector_get(c, j); standarderror[j] = sqrt(gsl_vector_get(z, j)); tstat[j] = coefficients[j] / standarderror[j]; pvalue[j] = 2 * (1 - gsl_cdf_tdist_P(abs(tstat[j]), Residualdf)); Lower95[j] = coefficients[j] - abs(gsl_cdf_tdist_Pinv(0.025, Residualdf)) * standarderror[j]; Upper95[j] = coefficients[j] + abs(gsl_cdf_tdist_Pinv(0.025, Residualdf)) * standarderror[j]; } double MultipleR = sqrt(1 - chisq / gsl_stats_tss(y->data, y->stride, y->size)); double RSquare = 1 - chisq / gsl_stats_tss(y->data, y->stride, y->size); double AdjustedRSquare = 1 - (ResidualSS / Residualdf) / (TotalSS / Totaldf); double StandardError = sqrt(ResidualMS); long int Observations = N; std::string outputfilename; outputfilename = "output.txt"; std::ofstream outputfile; outputfile.open(outputfilename); outputfile << std::fixed; outputfile.unsetf(std::ios::floatfield); outputfile.precision(10); outputfile << "SUMMARY OUTPUT" << std::endl; outputfile << std::endl; outputfile << " Regression Statistics" << std::endl; outputfile << std::left << std::setw(17) << "Multiple R" << std::right << std::setw(17) << MultipleR << std::endl; outputfile << std::left << std::setw(17) << "R Square" << std::right << std::setw(17) << RSquare << std::endl; outputfile << std::left << std::setw(17) << "Adjusted R Square" << std::right << std::setw(17) << AdjustedRSquare << std::endl; outputfile << std::left << std::setw(17) << "Standard Error" << std::right << std::setw(17) << StandardError << std::endl; outputfile << std::left << std::setw(17) << "Observations" << std::right << std::setw(17) << Observations << std::endl; outputfile << std::endl; //ANOVA double RegressionF = (RegressionSS / Regressiondf) / (ResidualSS / Residualdf); double SignificanceF = 1 - gsl_cdf_fdist_P(abs(RegressionF), Regressiondf, Residualdf); outputfile << "ANOVA" << std::endl; outputfile << std::setw(17) << "" << std::setw(17) << "df" << std::setw(17) << "SS" << std::setw(17) << "MS" << std::setw(17) << "F" << std::setw(17) << "Significance F" << std::endl; outputfile << std::left << std::setw(17) << "Regression" << std::right << std::setw(17) << Regressiondf << std::setw(17) << RegressionSS << std::setw(17) << RegressionMS << std::setw(17) << RegressionF << std::setw(17) << SignificanceF << std::endl; outputfile << std::left << std::setw(17) << "Residual" << std::right << std::setw(17) << Residualdf << std::setw(17) << ResidualSS << std::setw(17) << ResidualMS << std::endl; outputfile << std::left << std::setw(17) << "Total" << std::right << std::setw(17) << Totaldf << std::setw(17) << TotalSS << std::endl; outputfile << std::endl; outputfile << std::setw(17) << "" << std::setw(17) << "Coefficients" << std::setw(17) << "Standard Error" << std::setw(17) << "t Stat" << std::setw(17) << "P-value" << std::setw(17) << "Lower 95%" << std::setw(17) << "Upper 95%" << std::endl; outputfile << std::left << std::setw(10) << "Intercept" << std::setw(7) << "" << std::right << std::setw(17) << coefficients[0] << std::setw(17) << standarderror[0] << std::setw(17) << tstat[0] << std::setw(17) << pvalue[0] << std::setw(17) << Lower95[0] << std::setw(17) << Upper95[0] << std::endl; for (int j = 1; j < numcols; j++) { outputfile << std::left << std::setw(10) << "X Variable" << std::setw(7) << j << std::right << std::setw(17) << coefficients[j] << std::setw(17) << standarderror[j] << std::setw(17) << tstat[j] << std::setw(17) << pvalue[j] << std::setw(17) << Lower95[j] << std::setw(17) << Upper95[j] << std::endl; } gsl_matrix_free(X); gsl_matrix_free(cov); gsl_vector_free(y); gsl_vector_free(c); std::cout << "Finished!" << std::endl << "Press Enter."; std::cin.get(); return 0; }