From 7dc29680fc0dee10506ac6389afe2f162af0d7db Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 2 Mar 2026 15:32:40 -0600 Subject: [PATCH 01/12] Add dataset utilities and tests --- src/dataset.jl | 133 ++++++++++++++++++++++++++++++++++++++++++ test/dataset_tests.jl | 0 2 files changed, 133 insertions(+) create mode 100644 src/dataset.jl create mode 100644 test/dataset_tests.jl diff --git a/src/dataset.jl b/src/dataset.jl new file mode 100644 index 0000000..52c0153 --- /dev/null +++ b/src/dataset.jl @@ -0,0 +1,133 @@ + +using CSV +using DataFrames +using Downloads + +export Dataset, csv_dataset + +""" + Dataset(name, X, y) + +Contains datasets for ridge regression experiments. + +# Fields +- `name::String`: Name of dataset +- `X::Matrix{Float64}`: Matrix of variables/features +- `y::Vector{Float64}`: Target vector + +# Throws +- `ArgumentError`: If rows in `X` does not equal length of `y`. + +# Notes +Used as the experimental unit for ridge regression experiments. +""" +struct Dataset + name::String + X::Matrix{Float64} + y::Vector{Float64} + + function Dataset(name::String, X::AbstractMatrix, y::AbstractVector) + size(X, 1) == length(y) || + throw(ArgumentError("X and y must have same number of rows")) + + new(name, Matrix{Float64}(X), Vector{Float64}(y)) + end +end + +""" + one_hot_encode(dataset::Dataset; drop_first=true) + +One-hot encode categorical (string-like) features in `dataset.X`. + +# Arguments +- `dataset::Dataset`: Input dataset containing feature matrix `X` + and response vector `y`. + +# Keyword Arguments +- `drop_first::Bool=true`: If `true`, drop the first dummy column for + each categorical feature to avoid multicollinearity. + +# Returns +A new `Dataset` with numeric `X` and unchanged `y`. 
+""" +function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} + n = nrow(Xdf) + cols = Vector{Vector{Float64}}() + + for name in names(Xdf) + col = Xdf[!, name] + if eltype(col) <: Real + push!(cols, Float64.(col)) + continue + end + + scol = string.(col) + lv = unique(scol) + ind = scol .== permutedims(lv) + + println("Variable: $name") + for (j, level) in enumerate(lv) + println(" Dummy column (before drop) $j → $name = $level") + end + + if drop_first && size(ind, 2) > 1 + ind = ind[:, 2:end] + end + + for j in 1:size(ind, 2) + push!(cols, Float64.(ind[:, j])) + end + end + + p = length(cols) + X = Matrix{Float64}(undef, n, p) + for j in 1:p + X[:, j] = cols[j] + end + + return Matrix{Float64}(X) + +end +""" + csv_dataset(path_or_url; target_col, name="csv_dataset") + +Load a dataset from a CSV file or URL. + +# Arguments +- `path_or_url::String` + Local file path or web URL that has CSV data. + +- `target_col` + Column index OR column name containing the response variable. + +- `name::String` + Dataset name. + +# Returns +`Dataset` +""" +function csv_dataset(path_or_url::String; + target_col, + name::String = "csv_dataset" +) + + filepath = + startswith(path_or_url, "http") ? + Downloads.download(path_or_url) : + path_or_url + + df = DataFrame(CSV.File(filepath)) + df = dropmissing(df) + Xdf = select(df, DataFrames.Not(target_col)) + + y = target_col isa Int ? 
+ df[:, target_col] : + df[:, Symbol(target_col)] + + + X = one_hot_encode(Xdf; drop_first = true) + + + + return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y)) +end diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl new file mode 100644 index 0000000..e69de29 From 47fc954ddfe6fdf09c3c998929432b782079f760 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 3 Mar 2026 10:27:21 -0600 Subject: [PATCH 02/12] Adding dataset_tests.jl --- Project.toml | 10 ++++++- docs/make.jl | 1 + docs/src/design.md | 32 +++++++++++++++++++++ src/dataset.jl | 1 - test/dataset_tests.jl | 66 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 docs/src/design.md diff --git a/Project.toml b/Project.toml index 86ea0a5..427e8aa 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,15 @@ name = "RidgeRegression" uuid = "739161c8-60e1-4c49-8f89-ff30998444b1" -authors = ["Vivak Patel "] version = "0.1.0" +authors = ["Eton Tackett ", "Vivak Patel "] + +[deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" [compat] +CSV = "0.10.15" +DataFrames = "1.8.1" +Downloads = "1.7.0" julia = "1.12.4" diff --git a/docs/make.jl b/docs/make.jl index a1097bb..d42cfbe 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,6 +14,7 @@ makedocs(; ), pages=[ "Home" => "index.md", + "Design" => "design.md", ], ) diff --git a/docs/src/design.md b/docs/src/design.md new file mode 100644 index 0000000..c83592e --- /dev/null +++ b/docs/src/design.md @@ -0,0 +1,32 @@ +# Motivation and Background +Many modern applications, such as genome-wide association studies (GWAS) involve regression problems with a large number of predictors. Traditional least squares methods fail due to noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. 
The goal is to select the best possible model, "best" in the sense that we find the best tradeoff between goodness of fit and model complexity. Ridge regression, an approach within PLS, adds a regularization term. + +# Questions +Key Questions: +Which ridge regression algorithm is provides the best balance between: +-Numerical stability +-Computational aspects (GPU/CPU, runtime, etc) +-Predicative accuracy +# Experimental Units +The experimental units are the datasets under fixed penalty weights. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking procedure will be used. Datasets will be grouped according to their dimensional regime, characterized as p >> n, p ≈ n, and p << n. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. + +In addition to dimensional regime, matrix conditioning will be incorporated as a secondary blocking factor. The condition number of the design matrix quantifies the sensitivity of the regression problem to perturbations in the data and directly affects numerical stability and convergence behavior of ridge solution methods. Ill-conditioned matrices have slow convergence and are sensitive to errors, while well-conditioned matrices tend to produce stable and rapidly convergent behavior. + +| Blocking System | Factor | Blocks | +|:----------------|:-------|:-------| +| Dataset | Dimensional regime (\(p/n\)) | \( p \ll n \), \( p = n \), \( p \gg n \) | +| Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | +# Treatments +The treatments are the ridge regression solution methods: +-Gradient descent +-Stochastic gradient descent +-Closed-form solutions +# Observational Units and Measurements +The observational units are each algorithm-dataset pair. 
For each combination we will observe the following +| Measurement System | Factor | Measurements | +|:--------------------------|:--------------------------|:-------------| +| Predictive Performance | Prediction error | Training MSE, Test MSE, RMSE, R² | +| Estimation Accuracy | Parameter recovery | ‖β̂ − β_true‖₂² | (if known) +| Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | +| Numerical Stability | Solution accuracy | Perturbation sensitivity | +| Model Complexity | Coefficient magnitude | ‖β̂‖₂ | \ No newline at end of file diff --git a/src/dataset.jl b/src/dataset.jl index 52c0153..d135557 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -1,4 +1,3 @@ - using CSV using DataFrames using Downloads diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl index e69de29..cefa8ce 100644 --- a/test/dataset_tests.jl +++ b/test/dataset_tests.jl @@ -0,0 +1,66 @@ +using Test +using DataFrames +using CSV + +include("../src/dataset.jl") +@testset "Dataset" begin + X = [1 2; 3 4] + y = [10, 20] + d = Dataset("toy", X, y) + + @test d.name == "toy" + @test d.X isa Matrix{Float64} + @test d.y isa Vector{Float64} + @test size(d.X) == (2, 2) + @test length(d.y) == 2 + @test d.X[1, 1] == 1.0 + @test d.y[2] == 20.0 + + @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) +end + +@testset "one_hot_encode" begin + df = DataFrame( + A = ["red", "blue", "red", "green"], + B = [1, 2, 3, 4], + C = ["small", "large", "medium", "small"] + ) + + X = redirect_stdout(devnull) do + one_hot_encode(df; drop_first = true) + end + + @test size(X) == (4, 5) + @test X[:, 3] == [1.0, 2.0, 3.0, 4.0] + @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]]) + @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) + @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) +end + +@testset "csv_dataset" begin + tmp = tempname() * ".csv" + df = DataFrame( + a = [1.0, 2.0, missing, 4.0], + b = ["x", "y", "y", "x"], + y = [10.0, 20.0, 30.0, 40.0] + ) + CSV.write(tmp, df) + + d = 
redirect_stdout(devnull) do + csv_dataset(tmp; target_col=:y, name="tmp") + end + + @test d.name == "tmp" + @test d.X isa Matrix{Float64} + @test d.y isa Vector{Float64} + + @test length(d.y) == 3 + @test size(d.X, 1) == 3 + @test d.y == [10.0, 20.0, 40.0] + + d2 = redirect_stdout(devnull) do + csv_dataset(tmp; target_col=3, name="tmp2") + end + @test d2.y == [10.0, 20.0, 40.0] + @test size(d2.X, 1) == 3 +end From 787a88dc8a58861c69e32562f73a1727db0d52bc Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 3 Mar 2026 10:31:54 -0600 Subject: [PATCH 03/12] Small changes to design.md --- docs/src/design.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/design.md b/docs/src/design.md index c83592e..5c02373 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -14,7 +14,7 @@ In addition to dimensional regime, matrix conditioning will be incorporated as a | Blocking System | Factor | Blocks | |:----------------|:-------|:-------| -| Dataset | Dimensional regime (\(p/n\)) | \( p \ll n \), \( p = n \), \( p \gg n \) | +| Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| | Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | # Treatments The treatments are the ridge regression solution methods: From 2dd229507d2dc9b07c2867c06a4779a55d1944f9 Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 22:28:05 -0500 Subject: [PATCH 04/12] March 16 Updates --- src/dataset.jl | 42 ++++++++++++++++++------------------------ test/dataset_tests.jl | 2 +- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index d135557..18d7b6b 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -53,28 +53,25 @@ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64 n = nrow(Xdf) cols = Vector{Vector{Float64}}() - for name in names(Xdf) + for name in names(Xdf) #Selecting columns that aren't the target variable and 
pushing them to the columns. col = Xdf[!, name] if eltype(col) <: Real push!(cols, Float64.(col)) continue end - scol = string.(col) - lv = unique(scol) - ind = scol .== permutedims(lv) + scol = string.(col) # Convert to string for categorical processing. + lv = unique(scol) #Get unique category levels. + ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. + #Permutedims is used to align the dimensions for broadcasting. + #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. - println("Variable: $name") - for (j, level) in enumerate(lv) - println(" Dummy column (before drop) $j → $name = $level") - end - - if drop_first && size(ind, 2) > 1 + if drop_first && size(ind, 2) > 1 #Drop the first column of the indicator matrix to avoid multicollinearity if drop_first is true and there are multiple levels. ind = ind[:, 2:end] end for j in 1:size(ind, 2) - push!(cols, Float64.(ind[:, j])) + push!(cols, Float64.(ind[:, j])) #Convert the boolean indicator columns to Float64 and add them to the list of columns. end end @@ -93,17 +90,14 @@ end Load a dataset from a CSV file or URL. # Arguments -- `path_or_url::String` - Local file path or web URL that has CSV data. +- `path_or_url::String`: Local file path or web URL containing CSV data. -- `target_col` - Column index OR column name containing the response variable. - -- `name::String` - Dataset name. +# Keyword Arguments +- `target_col`: Column index or column name containing the response variable. +- `name::String="csv_dataset"`: Dataset name. # Returns -`Dataset` +A `Dataset`. 
""" function csv_dataset(path_or_url::String; target_col, @@ -115,13 +109,13 @@ function csv_dataset(path_or_url::String; Downloads.download(path_or_url) : path_or_url - df = DataFrame(CSV.File(filepath)) - df = dropmissing(df) - Xdf = select(df, DataFrames.Not(target_col)) + df = DataFrame(CSV.File(filepath)) #Read CSV file into a DataFrame. + df = dropmissing(df) #Remove rows with missing values. + Xdf = select(df, DataFrames.Not(target_col)) #Select all columns except the target column for features. y = target_col isa Int ? - df[:, target_col] : - df[:, Symbol(target_col)] + df[:, target_col] : #If target_col is an integer, use it as a column index to extract the target variable from the DataFrame. + df[:, Symbol(target_col)] #Extract the target variable based on whether target_col is an index or a name. X = one_hot_encode(Xdf; drop_first = true) diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl index cefa8ce..5652c96 100644 --- a/test/dataset_tests.jl +++ b/test/dataset_tests.jl @@ -2,7 +2,7 @@ using Test using DataFrames using CSV -include("../src/dataset.jl") +using RidgeRegression @testset "Dataset" begin X = [1 2; 3 4] y = [10, 20] From 173cdd14a1197e6bbe003723fa6e8df87a58575d Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 22:29:22 -0500 Subject: [PATCH 05/12] Ridge Regression file --- src/RidgeRegression.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/RidgeRegression.jl b/src/RidgeRegression.jl index c32de91..5ef574e 100644 --- a/src/RidgeRegression.jl +++ b/src/RidgeRegression.jl @@ -1,5 +1,11 @@ module RidgeRegression +using CSV +using DataFrames +using Downloads + +export Dataset, csv_dataset, one_hot_encode + +include("dataset.jl") -# Write your package code here. 
end From 042d5d60253d7b4e708fbc92a13ba1062df1d71a Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:12:52 -0500 Subject: [PATCH 06/12] dataset.jl small update --- src/dataset.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index 18d7b6b..5f1b77c 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -1,9 +1,3 @@ -using CSV -using DataFrames -using Downloads - -export Dataset, csv_dataset - """ Dataset(name, X, y) @@ -17,8 +11,8 @@ Contains datasets for ridge regression experiments. # Throws - `ArgumentError`: If rows in `X` does not equal length of `y`. -# Notes -Used as the experimental unit for ridge regression experiments. +!!! note + Used as the experimental unit for ridge regression experiments. """ struct Dataset name::String From 70cfa67631c488de2828e9fa2a9e377ab4d2abbe Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:19:22 -0500 Subject: [PATCH 07/12] Updated Experimental Units and Treatments Sections --- docs/src/design.md | 64 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/docs/src/design.md b/docs/src/design.md index 5c02373..4795866 100644 --- a/docs/src/design.md +++ b/docs/src/design.md @@ -1,32 +1,68 @@ # Motivation and Background -Many modern applications, such as genome-wide association studies (GWAS) involve regression problems with a large number of predictors. Traditional least squares methods fail due to noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. The goal is to select the best possible model, "best" in the sense that we find the best tradeoff between goodness of fit and model complexity. Ridge regression, an approach within PLS, adds a regularization term. +Many modern science problems involve regression problems with extremely large numbers of predictors. 
Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabalize the solution. +There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational cost, memory requirements, and numerical stability. + +The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem. To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ will be constructed with varying dimensions, sparsity patterns, and conditioning properties. # Questions Key Questions: Which ridge regression algorithm is provides the best balance between: --Numerical stability --Computational aspects (GPU/CPU, runtime, etc) --Predicative accuracy + +- Numerical stability +- Computational aspects (GPU/CPU, runtime, etc) # Experimental Units -The experimental units are the datasets under fixed penalty weights. Due to the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking procedure will be used. Datasets will be grouped according to their dimensional regime, characterized as p >> n, p ≈ n, and p << n. 
These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. +The experimental units are the datasets under fixed penalty weights. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$. Because the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. + +In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. + +Then + +$$ +X^\top X = V \Sigma^\top \Sigma V^\top += V \,\mathrm{diag}(\sigma_1^2,\dots,\sigma_p^2)\, V^\top . +$$ + +Adding the ridge term gives -In addition to dimensional regime, matrix conditioning will be incorporated as a secondary blocking factor. The condition number of the design matrix quantifies the sensitivity of the regression problem to perturbations in the data and directly affects numerical stability and convergence behavior of ridge solution methods. Ill-conditioned matrices have slow convergence and are sensitive to errors, while well-conditioned matrices tend to produce stable and rapidly convergent behavior. 
+$$ +X^\top X + \lambda I += +V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . +$$ +$$ +\kappa_2(X^\top X+\lambda I) += +\frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}. +$$ + +When ${\lambda}$ is small and the singular values are large, we aren't changing the conditioning number much. As such, the ridge penalty won't really affect the condition number. Conversely, if $\sigma_{\min}$ is close to 0, we have an ill-posed problem (Existence, uniqueness, stability not satisfied). However if a large ${\lambda}$ is chosen, the condition number is reduced and the problem becomes more stable. This behavior motivates blocking experiments according to the effective conditioning induced by the ridge penalty, allowing algorithm performance to be compared across well-posed and ill-posed regression settings. + +Another blocking factor that will be considered is how sparse or dense the matrix X is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. 
| Blocking System | Factor | Blocks | |:----------------|:-------|:-------| | Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| -| Matrix conditioning | Condition number of \( X \) or \( X^T X \) | Low, Medium, High | +| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (Frobenius Norm range of values, Calculate Singular Values)| +| Matrix Sparsity| Density of non-zero values in X | Sparse, Moderate, Dense (Need to ask about how to quantify this)| # Treatments +(How many treatments should we expect) +(What treatments correspond to which blocking system) + The treatments are the ridge regression solution methods: --Gradient descent --Stochastic gradient descent --Closed-form solutions + +- Gradient-based optimization +- Stochastic gradient descent +- Direct Methods + +For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias. If there are b possible combinations of the levels of each blocking factor + +The total number of block combinations is determined by the product of the number of levels in each blocking factor. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. # Observational Units and Measurements +Explanation needed +(How many rows/columns to expect) +(Explain measurements: Floating point, real, etc. Meaning of values, what values it can take.) The observational units are each algorithm-dataset pair. 
For each combination we will observe the following | Measurement System | Factor | Measurements | |:--------------------------|:--------------------------|:-------------| -| Predictive Performance | Prediction error | Training MSE, Test MSE, RMSE, R² | -| Estimation Accuracy | Parameter recovery | ‖β̂ − β_true‖₂² | (if known) | Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | -| Numerical Stability | Solution accuracy | Perturbation sensitivity | -| Model Complexity | Coefficient magnitude | ‖β̂‖₂ | \ No newline at end of file +| Numerical Stability | Solution accuracy | Perturbation sensitivity | \ No newline at end of file From 9b8c48ce777462643bc71f9b849b7b8fc1ca78ac Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Mon, 16 Mar 2026 23:38:17 -0500 Subject: [PATCH 08/12] Small changes --- src/dataset.jl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/dataset.jl b/src/dataset.jl index 5f1b77c..4e20c89 100644 --- a/src/dataset.jl +++ b/src/dataset.jl @@ -28,20 +28,19 @@ struct Dataset end """ - one_hot_encode(dataset::Dataset; drop_first=true) + one_hot_encode(Xdf::DataFrame; drop_first=true) -One-hot encode categorical (string-like) features in `dataset.X`. +One-hot encode categorical (string-like) features in `Xdf`. # Arguments -- `dataset::Dataset`: Input dataset containing feature matrix `X` - and response vector `y`. +- `Xdf::DataFrame`: Input DataFrame containing features and response vector `y`. # Keyword Arguments - `drop_first::Bool=true`: If `true`, drop the first dummy column for each categorical feature to avoid multicollinearity. # Returns -A new `Dataset` with numeric `X` and unchanged `y`. +- `Matrix{Float64}`: A numeric matrix containing the encoded feature. """ function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} n = nrow(Xdf) @@ -91,7 +90,7 @@ Load a dataset from a CSV file or URL. - `name::String="csv_dataset"`: Dataset name. 
# Returns -A `Dataset`. +- `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name. """ function csv_dataset(path_or_url::String; target_col, From 6c923abdc5770c213f8cfe223dbc94682a301f9f Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 17 Mar 2026 10:34:42 -0500 Subject: [PATCH 09/12] Bidiagonalization Stuff --- src/bidiagonalization.jl | 146 ++++++++++++++++++++++++++++++++ test/bidiagonalization_tests.jl | 137 ++++++++++++++++++++++++++++++ 2 files changed, 283 insertions(+) create mode 100644 src/bidiagonalization.jl create mode 100644 test/bidiagonalization_tests.jl diff --git a/src/bidiagonalization.jl b/src/bidiagonalization.jl new file mode 100644 index 0000000..5405e7e --- /dev/null +++ b/src/bidiagonalization.jl @@ -0,0 +1,146 @@ +""" + compute_givens(a, b) + +Compute Givens rotation coefficients for scalars `a` and `b`. + +# Arguments +- `a::Number`: First scalar +- `b::Number`: Second scalar + +# Returns +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +""" +function compute_givens(a::Number, b::Number) # Compute Givens rotation coefficients for scalars a and b + if b == 0 + return one(a), zero(a) + elseif a == 0 + throw(ArgumentError("a is zero, cannot compute Givens rotation")) + else + r = hypot(a, b) + c = a / r + s = b / r + return c, s + end +end + +""" + rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + +Apply a Givens rotation to rows `i` and `j` of matrix `M`. 
+ +# Arguments +- `M::AbstractMatrix`: The matrix to be rotated +- `i::Int`: First row index +- `j::Int`: Second row index +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +# Returns +- `M::AbstractMatrix`: The rotated matrix + +""" +function rotate_rows!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + for k in 1:size(M,2) # Loop over columns + temp = M[i,k] # Store the original value of M[i,k] before modification + M[i,k] = c*temp + s*M[j,k] + M[j,k] = -conj(s)*temp + c*M[j,k] #Apply the Givens rotation to the elements in rows i and j + end + return M +end + + +""" + rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + +Apply a Givens rotation to columns `i` and `j` of matrix `M`. + +# Arguments +- `M::AbstractMatrix`: The matrix to be rotated +- `i::Int`: First column index +- `j::Int`: Second column index +- `c::Number`: Cosine coefficient of the Givens rotation +- `s::Number`: Sine coefficient of the Givens rotation + +# Returns +- `M::AbstractMatrix`: The rotated matrix + +""" +function rotate_cols!(M::AbstractMatrix,i::Int,j::Int,c::Number,s::Number) + for k in 1:size(M,1) # Loop over rows + temp = M[k,i] # Store the original value of M[k,i] before modification + M[k,i] = c*temp + s*M[k,j] + M[k,j] = -conj(s)*temp + c*M[k,j] # Apply the Givens rotation to the elements in columns i and j + end + return M +end +""" + bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector) + +Performs bidiagonalization of a matrix A using a sequence of Givens transformations while explicitly accumulating +the orthogonal left factor `H` and right factor `K` such that + + H' * A * K = B. 
+
+# Arguments
+- `A::AbstractMatrix`: The matrix to be bidiagonalized
+- `L::AbstractMatrix`: The banded matrix to be updated in-place
+- `b::AbstractVector`: The vector to be transformed
+
+# Returns
+- `B::AbstractMatrix`: The bidiagonalized form of the input matrix A with dimension (m,n)
+- `C::AbstractMatrix`: The matrix resulting from applying the right Givens transformations to `L`
+- `H::AbstractMatrix`: The orthogonal left factor
+- `K::AbstractMatrix`: The orthogonal right factor
+- `Ht::AbstractMatrix`: The transpose of the orthogonal left factor
+- `bhat::AbstractVector`: The vector resulting from applying the Givens transformations to the input vector b
+
+"""
+
+function bidiagonalize_with_H(A::AbstractMatrix, L::AbstractMatrix, b::AbstractVector)
+    m, n = size(A)
+
+    B = copy(A) #Will be transformed into bidiagonal form
+    C = copy(L)
+    bhat = copy(b) #Will receive same left transformations applied to A
+
+    Ht = Matrix{eltype(A)}(I, m, m) #Ht will accumulate the left transformations, initialized as identity
+    K = Matrix{eltype(A)}(I, n, n) #K will accumulate the right transformations, initialized as identity
+
+    imax = min(m, n)
+
+    for i in 1:imax
+        # Left Givens rotations: zero below diagonal in column i
+        for j in m:-1:(i + 1)
+            if B[j, i] != 0
+                c, s = compute_givens(B[i, i], B[j, i]) #Build Givens rotation to zero B[j, i]
+                rotate_rows!(B, i, j, c, s) #Apply the Givens rotation to rows i and j of B
+                rotate_rows!(Ht, i, j, c, s) #Accumulate the left transformations in Ht
+
+                temp = bhat[i]
+                bhat[i] = c*temp + s*bhat[j]
+                bhat[j] = -conj(s)*temp + c*bhat[j] #Applying same left rotation to bhat, bhat tracks how b is transformed by the left transformations. 
+ + B[j, i] = zero(eltype(B)) + end + end + + # Right Givens rotations: zero entries right of superdiagonal + if i <= n - 2 + for k in n:-1:(i + 2) + if B[i, k] != 0 + c, s = compute_givens(B[i, i + 1], B[i, k]) #Build Givens rotation to zero B[i, k] + #s = -s + rotate_cols!(B, i + 1, k, c, s) #Apply the Givens rotation to columns i+1 and k of B + rotate_cols!(C, i + 1, k, c, s) #Apply the same right rotation to C, since C is updated by the right transformations + rotate_cols!(K, i + 1, k, c, s) #Accumulate the right transformations in K + B[i, k] = zero(eltype(B)) + end + end + end + end + + H = transpose(Ht) + return B, C, H, K, Ht, bhat +end diff --git a/test/bidiagonalization_tests.jl b/test/bidiagonalization_tests.jl new file mode 100644 index 0000000..5fa22e8 --- /dev/null +++ b/test/bidiagonalization_tests.jl @@ -0,0 +1,137 @@ +using Test +using LinearAlgebra +using RidgeRegression + +@testset "compute_givens" begin + c, s = compute_givens(3.0, 0.0) + @test c == 1.0 + @test s == 0.0 + a = 3.0 + b = 4.0 + c, s = compute_givens(a, b) + v1 = c*a + s*b + v2 = -s*a + c*b + @test isapprox(v2, 0.0; atol=1e-12, rtol=0) + @test isapprox(abs(v1), hypot(a, b); atol=1e-12, rtol=0) + @test_throws ArgumentError compute_givens(0.0, 2.0) +end +@testset "rotate_rows!" begin + M = [1.0 2.0; + 3.0 4.0] + + c, s = compute_givens(1.0, 3.0) + rotate_rows!(M, 1, 2, c, s) + + @test isapprox(M[2, 1], 0.0; atol=1e-12, rtol=0) +end + +@testset "rotate_cols!" 
begin + M = [3.0 4.0; + 1.0 2.0] + + c, s = compute_givens(3.0, 4.0) + rotate_cols!(M, 1, 2, c, s) + + @test isapprox(M[1, 2], 0.0; atol=1e-12, rtol=0) +end +@testset "bidiagonalize_with_H basic properties" begin + A = [1.0 2.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 10.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0, 3.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + # Ht should be the transpose of H + @test Ht ≈ H' + + # Orthogonality of H and K + @test H' * H ≈ Matrix{Float64}(I, 3, 3) + @test H * H' ≈ Matrix{Float64}(I, 3, 3) + @test K' * K ≈ Matrix{Float64}(I, 3, 3) + @test K * K' ≈ Matrix{Float64}(I, 3, 3) + + # Main factorization identity + @test H' * A * K ≈ B + + @test H' * b ≈ bhat + + # Applies right transforms to L, implicitly assuming J = I + @test L * K ≈ C +end + +@testset "bidiagonalize_with_H produces upper bidiagonal matrix" begin + A = [2.0 1.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 9.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, -1.0, 2.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + m, n = size(B) + + for i in 1:m + for j in 1:n + # In an upper bidiagonal matrix, only diagonal and superdiagonal may be nonzero + if !(j == i || j == i + 1) + @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) + end + end + end +end + +@testset "bidiagonalize_with_H rectangular tall matrix" begin + A = [1.0 2.0 3.0; + 4.0 5.0 6.0; + 7.0 8.0 9.0; + 1.0 0.0 1.0] + + L = Matrix{Float64}(I, 3, 3) + b = [1.0, 2.0, 3.0, 4.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + m, n = size(A) + + @test size(B) == (m, n) + @test size(C) == size(L) + @test size(H) == (m, m) + @test size(K) == (n, n) + @test size(Ht) == (m, m) + @test length(bhat) == length(b) + + @test H' * H ≈ Matrix{Float64}(I, m, m) + @test K' * K ≈ Matrix{Float64}(I, n, n) + + @test H' * A * K ≈ B + @test H' * b ≈ bhat + @test L * K ≈ C + + for i in 1:m + for j in 1:n + if !(j == i || j == i + 1) + @test isapprox(B[i, j], 0.0; atol=1e-10, rtol=0) + end + end + end +end + 
+@testset "bidiagonalize_with_H identity-like simple case" begin + A = [3.0 0.0; + 4.0 5.0] + + L = Matrix{Float64}(I, 2, 2) + b = [1.0, 2.0] + + B, C, H, K, Ht, bhat = bidiagonalize_with_H(A, L, b) + + @test H' * A * K ≈ B + @test H' * b ≈ bhat + @test L * K ≈ C + + @test isapprox(B[2,1], 0.0; atol=1e-12, rtol=0) +end From 6727a9006ede2a5ee3f95fa40371c9745974acee Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Tue, 17 Mar 2026 11:00:18 -0500 Subject: [PATCH 10/12] Adding Linear Algebra to Project.toml --- test/Project.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/Project.toml b/test/Project.toml index 0c36332..73141b0 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,2 +1,5 @@ [deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" \ No newline at end of file From 0a6b96449fb89966f04df1a665ce19163842acfd Mon Sep 17 00:00:00 2001 From: Eton Tackett Date: Fri, 20 Mar 2026 19:02:48 -0500 Subject: [PATCH 11/12] Small Changes --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index 427e8aa..2e084bf 100644 --- a/Project.toml +++ b/Project.toml @@ -7,9 +7,11 @@ authors = ["Eton Tackett ", "Vivak Patel Date: Fri, 20 Mar 2026 19:04:42 -0500 Subject: [PATCH 12/12] Removing irrelevant things --- docs/src/design.md | 68 ------------------------ src/dataset.jl | 119 ------------------------------------------ test/dataset_tests.jl | 66 ----------------------- 3 files changed, 253 deletions(-) delete mode 100644 docs/src/design.md delete mode 100644 src/dataset.jl delete mode 100644 test/dataset_tests.jl diff --git a/docs/src/design.md b/docs/src/design.md deleted file mode 100644 index 4795866..0000000 --- a/docs/src/design.md +++ /dev/null @@ -1,68 +0,0 @@ -# Motivation and Background -Many modern science problems involve regression problems with 
extremely large numbers of predictors. Genome-wide association studies (GWAS), for example, try to identify genetic variants associated with a disease phenotype using hundreds of thousands or millions of genomic features. In such settings, traditional least squares methods fail because noise and ill-conditioning. Penalized Least Squares (PLS) extends ordinary least squares (OLS) regression by adding a penalty term to shrink parameter estimates. Ridge regression, an approach within PLS, adds a regularization term, producing a regularized estimator that helps to stabalize the solution. - -There are many numerical algorithms available to compute ridge regression estimates including direct methods, Krylov subspace methods, gradient-based optimization, coordinate descent, and stochastic gradient descent. These algorithms differ in their computational cost, memory requirements, and numerical stability. - -The goal of this experiment is to investigate the performance of these algorithms when we vary the structure and scale of the regression problem. To do this, we consider the linear model $\mathbf{y} = X\boldsymbol{\beta} + \boldsymbol{\varepsilon}$ where the matrix ${X}$ will be constructed with varying dimensions, sparsity patterns, and conditioning properties. -# Questions -Key Questions: -Which ridge regression algorithm is provides the best balance between: - -- Numerical stability -- Computational aspects (GPU/CPU, runtime, etc) -# Experimental Units -The experimental units are the datasets under fixed penalty weights. Each dataset will contain a matrix ${X}$, a response vector $\mathbf{y}$, and a regularization parameter ${\lambda}$. Because the statistical behavior of ridge regression algorithms depends strongly on the dimensional structure of the problem, a blocking system will be used. Datasets will be grouped according to their dimensional regime, characterized as $p \ll n$, p ≈ n, and $p \gg n$. 
These regimes correspond to fundamentally different geometric properties of the design matrix, including rank behavior, conditioning, and the stability of the normal equations. - -In addition to dimensional block, the strength of the ridge penalty will be incorporated as a secondary blocking factor. The ridge estimator is $\hat{\beta_R} = (X^\top X + \lambda I)^{-1}X^\top y$. The matrix conditioning number is defined as $\kappa(A) = \frac{\sigma_{\max}(A)}{\sigma_{\min}(A)}$. In the context of ridge regression, the regularization parameter ${\lambda}$, can impact the conditioning number. Let $X = U\Sigma V^\top$ be the SVD of $X$, with singular values $\sigma_1,\dots,\sigma_p$. - -Then - -$$ -X^\top X = V \Sigma^\top \Sigma V^\top -= V \,\mathrm{diag}(\sigma_1^2,\dots,\sigma_p^2)\, V^\top . -$$ - -Adding the ridge term gives - -$$ -X^\top X + \lambda I -= -V \,\mathrm{diag}(\sigma_1^2+\lambda,\dots,\sigma_p^2+\lambda)\, V^\top . -$$ -$$ -\kappa_2(X^\top X+\lambda I) -= -\frac{\sigma_{\max}^2+\lambda}{\sigma_{\min}^2+\lambda}. -$$ - -When ${\lambda}$ is small and the singular values are large, we aren't changing the conditioning number much. As such, the ridge penalty won't really affect the condition number. Conversely, if $\sigma_{\min}$ is close to 0, we have an ill-posed problem (Existence, uniqueness, stability not satisfied). However if a large ${\lambda}$ is chosen, the condition number is reduced and the problem becomes more stable. This behavior motivates blocking experiments according to the effective conditioning induced by the ridge penalty, allowing algorithm performance to be compared across well-posed and ill-posed regression settings. - -Another blocking factor that will be considered is how sparse or dense the matrix X is. Many algorithms behave differently depending on whether the matrix is sparse or dense. In ridge regression, there are many operations involving X including matrix-matrix products and matrix-vector products. 
A dense matrix leads to high computational cost whereas a sparse matrix we can significantly reduce the cost. As such, different algorithms may perform better depending on the sparsity structure of X, making matrix sparsity a relevant blocking factor when comparing algorithm behavior and computational efficiency. - -| Blocking System | Factor | Blocks | -|:----------------|:-------|:-------| -| Dataset | Dimensional regime (\(p/n\)) | $(p \ll n)$, $(p \approx n)$, $(p \gg n)$| -| Ridge Penalty| Value of ${\lambda}$ relative to the singular values | Small, Large (Frobenius Norm range of values, Calculate Singular Values)| -| Matrix Sparsity| Density of non-zero values in X | Sparse, Moderate, Dense (Need to ask about how to quantify this)| -# Treatments -(How many treatments should we expect) -(What treatments correspond to which blocking system) - -The treatments are the ridge regression solution methods: - -- Gradient-based optimization -- Stochastic gradient descent -- Direct Methods - -For each experimental unit, all treatments will be applied to the dataset. This will be done so that differences in performance can be attributed to the algorithms themselves rather than the data. The order of treatment application will be completely randomized to avoid any systemic bias. If there are b possible combinations of the levels of each blocking factor - -The total number of block combinations is determined by the product of the number of levels in each blocking factor. For example, if the experiment includes three dimensional regimes, two sparsity levels, and two regularization strengths, then there are $3 * 2 * 2 = 12$ block combinations. -# Observational Units and Measurements -Explanation needed -(How many rows/columns to expect) -(Explain measurements: Floating point, real, etc. Meaning of values, what values it can take.) -The observational units are each algorithm-dataset pair. 
For each combination we will observe the following -| Measurement System | Factor | Measurements | -|:--------------------------|:--------------------------|:-------------| -| Computational Performance | Efficiency | Runtime (seconds), Iterations to convergence | -| Numerical Stability | Solution accuracy | Perturbation sensitivity | \ No newline at end of file diff --git a/src/dataset.jl b/src/dataset.jl deleted file mode 100644 index 4e20c89..0000000 --- a/src/dataset.jl +++ /dev/null @@ -1,119 +0,0 @@ -""" - Dataset(name, X, y) - -Contains datasets for ridge regression experiments. - -# Fields -- `name::String`: Name of dataset -- `X::Matrix{Float64}`: Matrix of variables/features -- `y::Vector{Float64}`: Target vector - -# Throws -- `ArgumentError`: If rows in `X` does not equal length of `y`. - -!!! note - Used as the experimental unit for ridge regression experiments. -""" -struct Dataset - name::String - X::Matrix{Float64} - y::Vector{Float64} - - function Dataset(name::String, X::AbstractMatrix, y::AbstractVector) - size(X, 1) == length(y) || - throw(ArgumentError("X and y must have same number of rows")) - - new(name, Matrix{Float64}(X), Vector{Float64}(y)) - end -end - -""" - one_hot_encode(Xdf::DataFrame; drop_first=true) - -One-hot encode categorical (string-like) features in `Xdf`. - -# Arguments -- `Xdf::DataFrame`: Input DataFrame containing features and response vector `y`. - -# Keyword Arguments -- `drop_first::Bool=true`: If `true`, drop the first dummy column for - each categorical feature to avoid multicollinearity. - -# Returns -- `Matrix{Float64}`: A numeric matrix containing the encoded feature. -""" -function one_hot_encode(Xdf::DataFrame; drop_first::Bool = true)::Matrix{Float64} - n = nrow(Xdf) - cols = Vector{Vector{Float64}}() - - for name in names(Xdf) #Selecting columns that aren't the target variable and pushing them to the columns. 
- col = Xdf[!, name] - if eltype(col) <: Real - push!(cols, Float64.(col)) - continue - end - - scol = string.(col) # Convert to string for categorical processing. - lv = unique(scol) #Get unique category levels. - ind = scol .== permutedims(lv) #Create indicator matrix for each level of the categorical variable. - #Permutedims is used to align the dimensions for broadcasting. - #Broadcasting compares each element of `scol` with each level in `lv`, resulting in a matrix where each column corresponds to a level and contains `true` for rows that match that level and `false` otherwise. - - if drop_first && size(ind, 2) > 1 #Drop the first column of the indicator matrix to avoid multicollinearity if drop_first is true and there are multiple levels. - ind = ind[:, 2:end] - end - - for j in 1:size(ind, 2) - push!(cols, Float64.(ind[:, j])) #Convert the boolean indicator columns to Float64 and add them to the list of columns. - end - end - - p = length(cols) - X = Matrix{Float64}(undef, n, p) - for j in 1:p - X[:, j] = cols[j] - end - - return Matrix{Float64}(X) - -end -""" - csv_dataset(path_or_url; target_col, name="csv_dataset") - -Load a dataset from a CSV file or URL. - -# Arguments -- `path_or_url::String`: Local file path or web URL containing CSV data. - -# Keyword Arguments -- `target_col`: Column index or column name containing the response variable. -- `name::String="csv_dataset"`: Dataset name. - -# Returns -- `Dataset`: A dataset containing the encoded feature matrix `X`, response vector `y`, and dataset name. -""" -function csv_dataset(path_or_url::String; - target_col, - name::String = "csv_dataset" -) - - filepath = - startswith(path_or_url, "http") ? - Downloads.download(path_or_url) : - path_or_url - - df = DataFrame(CSV.File(filepath)) #Read CSV file into a DataFrame. - df = dropmissing(df) #Remove rows with missing values. - Xdf = select(df, DataFrames.Not(target_col)) #Select all columns except the target column for features. 
- - y = target_col isa Int ? - df[:, target_col] : #If target_col is an integer, use it as a column index to extract the target variable from the DataFrame. - df[:, Symbol(target_col)] #Extract the target variable based on whether target_col is an index or a name. - - - X = one_hot_encode(Xdf; drop_first = true) - - - - return Dataset(name, Matrix{Float64}(X), Vector{Float64}(y)) -end diff --git a/test/dataset_tests.jl b/test/dataset_tests.jl deleted file mode 100644 index 5652c96..0000000 --- a/test/dataset_tests.jl +++ /dev/null @@ -1,66 +0,0 @@ -using Test -using DataFrames -using CSV - -using RidgeRegression -@testset "Dataset" begin - X = [1 2; 3 4] - y = [10, 20] - d = Dataset("toy", X, y) - - @test d.name == "toy" - @test d.X isa Matrix{Float64} - @test d.y isa Vector{Float64} - @test size(d.X) == (2, 2) - @test length(d.y) == 2 - @test d.X[1, 1] == 1.0 - @test d.y[2] == 20.0 - - @test_throws ArgumentError Dataset("bad", X, [1, 2, 3]) -end - -@testset "one_hot_encode" begin - df = DataFrame( - A = ["red", "blue", "red", "green"], - B = [1, 2, 3, 4], - C = ["small", "large", "medium", "small"] - ) - - X = redirect_stdout(devnull) do - one_hot_encode(df; drop_first = true) - end - - @test size(X) == (4, 5) - @test X[:, 3] == [1.0, 2.0, 3.0, 4.0] - @test all(x -> x == 0.0 || x == 1.0, X[:, [1,2,4,5]]) - @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1) - @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1) -end - -@testset "csv_dataset" begin - tmp = tempname() * ".csv" - df = DataFrame( - a = [1.0, 2.0, missing, 4.0], - b = ["x", "y", "y", "x"], - y = [10.0, 20.0, 30.0, 40.0] - ) - CSV.write(tmp, df) - - d = redirect_stdout(devnull) do - csv_dataset(tmp; target_col=:y, name="tmp") - end - - @test d.name == "tmp" - @test d.X isa Matrix{Float64} - @test d.y isa Vector{Float64} - - @test length(d.y) == 3 - @test size(d.X, 1) == 3 - @test d.y == [10.0, 20.0, 40.0] - - d2 = redirect_stdout(devnull) do - csv_dataset(tmp; target_col=3, name="tmp2") - end - @test d2.y == [10.0, 
20.0, 40.0] - @test size(d2.X, 1) == 3 -end