Skip to content
11 changes: 10 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
name = "RidgeRegression"
uuid = "739161c8-60e1-4c49-8f89-ff30998444b1"
authors = ["Vivak Patel <vp314@users.noreply.github.com>"]
version = "0.1.0"
authors = ["Eton Tackett <etont@icloud.com>", "Vivak Patel <vp314@users.noreply.github.com>"]

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
# NOTE(review): a package must not list itself under [deps]; drop this self-dependency.
RidgeRegression = "739161c8-60e1-4c49-8f89-ff30998444b1"

[compat]
CSV = "0.10.15"
DataFrames = "1.8.1"
Downloads = "1.7.0"
julia = "1.12.4"
1 change: 1 addition & 0 deletions docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ makedocs(;
),
pages=[
"Home" => "index.md",
"Design" => "design.md",
],
)

Expand Down
8 changes: 7 additions & 1 deletion src/RidgeRegression.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
module RidgeRegression

# Third-party dependencies used by the dataset utilities in src/dataset.jl.
using CSV
using DataFrames
using Downloads

# Dataset type plus CSV loading / one-hot encoding helpers.
include("dataset.jl")

export Dataset, load_csv_dataset, one_hot_encode

end
144 changes: 144 additions & 0 deletions src/dataset.jl
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All dependencies should appear in the Project.toml file. You should activate the package environment and then "add ..." your dependencies to ensure compatibility and correct environment for the package.

Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
"""
Dataset <: ExperimentalUnit
A dataset for Ridge Regression experiments.
# Description
A `Dataset` object stores the design matrix ``X`` and response vector ``y``
for a regression problem. These datasets serve as the experimental units for ridge regression experiments, allowing us to evaluate the performance of ridge regression models on various datasets.
# Fields
- `name::String`: Name of dataset
- `X::TX`: Matrix of variables/features
- `y::TY`: Target vector
# Constructor
Dataset(name::String, X::AbstractMatrix, y::AbstractVector)
## Arguments
- `name::String`: Name of dataset
- `X::TX`: Matrix of variables/features
- `y::TY`: Target vector
## Returns
- A `Dataset` object containing the numeric design matrix and response vector.
## Throws
- `ArgumentError`: If rows in `X` does not equal length of `y`.
Comment on lines +1 to +29
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There should be documentation for the struct being created and then there should be documentation for the constructor in the same docstring.

!!! note
`Dataset` objects are used as experimental units when evaluating
ridge regression algorithms. The parametric design allows both dense
and sparse matrices to be stored without forcing conversion to a
dense `Matrix{Float64}`.
"""
struct Dataset{MT<:AbstractMatrix, VT<:AbstractVector}
    name::String  # human-readable dataset identifier
    X::MT         # design/feature matrix, one row per observation
    y::VT         # response vector, one entry per observation

    # Inner constructor: X and y must describe the same set of observations.
    function Dataset(name::String, X::MT, y::VT) where {MT<:AbstractMatrix, VT<:AbstractVector}
        if size(X, 1) != length(y)
            throw(ArgumentError("X and y must have same number of rows"))
        end
        return new{MT, VT}(name, X, y)
    end
end

"""
    one_hot_encode(Xdf::DataFrame; cols_to_encode, drop_first=true)

One-hot encode the selected columns of `Xdf` and return a numeric matrix.

Columns named in `cols_to_encode` are converted to strings and expanded into
one indicator column per category level (levels appear in order of first
occurrence). Every remaining column must already be numeric and is copied
through as `Float64`.

# Arguments
- `Xdf::DataFrame`: Input feature data frame.

# Keyword Arguments
- `cols_to_encode`: Column names (or integer positions) to one-hot encode.
- `drop_first::Bool=true`: When `true`, the first indicator column of each
  encoded feature is dropped to avoid multicollinearity.

# Returns
- `Matrix{Float64}`: Encoded feature matrix with `nrow(Xdf)` rows.

# Throws
- `ArgumentError`: If a column is non-numeric and not listed in `cols_to_encode`.
"""
function one_hot_encode(Xdf::DataFrame; cols_to_encode, drop_first::Bool = true)::Matrix{Float64}
    nrows = nrow(Xdf)

    # Normalize the requested columns (names or integer positions) to Symbols.
    targets = Set(c isa Int ? Symbol(names(Xdf)[c]) : Symbol(c) for c in cols_to_encode)

    columns = Vector{Vector{Float64}}()
    for colname in names(Xdf)
        data = Xdf[!, colname]
        if Symbol(colname) in targets
            labels = string.(data)        # stringify so any eltype can be treated categorically
            levels = unique(labels)       # category levels, in order of first appearance
            # Optionally drop the first level to avoid multicollinearity.
            keep = (drop_first && length(levels) > 1) ? levels[2:end] : levels
            for level in keep
                # 0/1 indicator column for this category level.
                push!(columns, Float64.(labels .== level))
            end
        elseif eltype(data) <: Real
            push!(columns, Float64.(data))
        else
            throw(ArgumentError("Column $colname must be numeric unless it is listed in cols_to_encode"))
        end
    end

    # Assemble the column list into a dense n × p matrix (n × 0 when empty).
    return reduce(hcat, columns; init = Matrix{Float64}(undef, nrows, 0))
end
"""
    load_csv_dataset(path_or_url; cols_to_encode=Symbol[], target_col, name="csv_dataset")

Load a regression dataset from a local CSV file or a remote URL.

Rows containing missing values are dropped, feature columns listed in
`cols_to_encode` are one-hot encoded (first level dropped), and the result is
wrapped in a [`Dataset`](@ref).

# Arguments
- `path_or_url::String`: Local file path, or an `http(s)` URL to download.

# Keyword Arguments
- `cols_to_encode=Symbol[]`: Feature-column names or integer positions
  (relative to the feature frame, i.e. after the target is removed) to encode.
- `target_col`: Column name, or integer index into the full data frame,
  containing the response variable.
- `name::String="csv_dataset"`: Name stored on the returned `Dataset`.

# Returns
- `Dataset`: Contains the encoded `Matrix{Float64}` features and
  `Vector{Float64}` response.
"""
function load_csv_dataset(path_or_url::String; cols_to_encode=Symbol[], target_col, name::String = "csv_dataset")
    # Remote data is downloaded to a temp file; local paths are used as-is.
    # NOTE(review): prefix match also catches local paths that start with "http".
    filepath =
        startswith(path_or_url, "http") ?
        Downloads.download(path_or_url) :
        path_or_url

    df = dropmissing(DataFrame(CSV.File(filepath)))  # drop rows with missing values

    # Features are every column except the target.
    Xdf = select(df, DataFrames.Not(target_col))

    # target_col may be an integer position or a column name.
    y = target_col isa Int ?
        df[:, target_col] :
        df[:, Symbol(target_col)]

    # Normalize encode columns to Symbols; integer positions refer to Xdf.
    encode_cols = [c isa Int ? Symbol(names(Xdf)[c]) : Symbol(c) for c in cols_to_encode]
    X = one_hot_encode(Xdf; cols_to_encode=encode_cols, drop_first = true)

    return Dataset(name, X, collect(Float64, y))
end
7 changes: 7 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,9 @@
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
RidgeRegression = "739161c8-60e1-4c49-8f89-ff30998444b1"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
CSV = "0.10"
DataFrames = "1"
19 changes: 19 additions & 0 deletions test/dataset_tests.jl
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review unit testing documentation in Julia to see how to do this correctly.

Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Construction stores the provided name, features, and response unchanged.
@testset "Dataset stores constructor arguments" begin
    X = [1 2; 3 4]
    y = [10, 20]
    d = Dataset("toy", X, y)

    @test d.name == "toy"
    @test d.X == X
    @test d.y == y
    @test size(d.X) == (2, 2)
    @test length(d.y) == 2
    @test d.X[1, 1] == 1.0
    @test d.y[2] == 20.0
end

# Constructor must reject y whose length differs from the row count of X.
@testset "Dataset rejects mismatched dimensions" begin
    X = [1 2; 3 4]

    # X has 2 rows but y has 3 entries.
    @test_throws ArgumentError Dataset("bad", X, [1, 2, 3])
end
38 changes: 38 additions & 0 deletions test/encoding_tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# String columns expand to indicator columns; numeric columns pass through.
@testset "one_hot_encode expands string columns" begin
    df = DataFrame(
        A = ["red", "blue", "red", "green"],
        B = [1, 2, 3, 4],
        C = ["small", "large", "medium", "small"]
    )

    X = one_hot_encode(df; cols_to_encode=[:A, :C], drop_first=true)

    # A: 3 levels -> 2 indicators; B: 1 numeric; C: 3 levels -> 2 indicators.
    @test size(X) == (4, 5)
    @test X[:, 3] == [1.0, 2.0, 3.0, 4.0]
    @test all(x -> x == 0.0 || x == 1.0, X[:, [1, 2, 4, 5]])
    # With the first level dropped, at most one indicator fires per row.
    @test all(vec(sum(X[:, 1:2]; dims=2)) .<= 1)
    @test all(vec(sum(X[:, 4:5]; dims=2)) .<= 1)
end

# A non-numeric column that is not listed for encoding must raise an error.
@testset "one_hot_encode rejects unlisted string columns" begin
    df = DataFrame(
        A = ["red", "blue", "red", "green"],
        B = [1, 2, 3, 4],
        C = ["small", "large", "medium", "small"]
    )

    # Column C is a string column but is not in cols_to_encode.
    @test_throws ArgumentError one_hot_encode(df; cols_to_encode=[:A], drop_first=true)
end

# Numeric columns may also be treated as categorical when explicitly listed.
@testset "one_hot_encode can encode numeric columns" begin
    df = DataFrame(
        group = [1, 2, 1, 3],
        x = [10.0, 20.0, 30.0, 40.0]
    )

    X = one_hot_encode(df; cols_to_encode=[:group], drop_first=true)

    # group: 3 levels -> 2 indicators; x passes through as the last column.
    @test size(X) == (4, 3)
    @test X[:, 3] == [10.0, 20.0, 30.0, 40.0]
    @test all(x -> x == 0.0 || x == 1.0, X[:, 1:2])
end
38 changes: 38 additions & 0 deletions test/load_csv_dataset_tests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# End-to-end load from a temp CSV, selecting the target by column name.
@testset "load_csv_dataset with target by name" begin
    tmp = tempname() * ".csv"

    df = DataFrame(
        a = [1.0, 2.0, missing, 4.0],
        b = ["x", "y", "y", "x"],
        y = [10.0, 20.0, 30.0, 40.0]
    )

    CSV.write(tmp, df)
    try
        d = load_csv_dataset(tmp; target_col=:y, cols_to_encode=[:b], name="tmp")

        @test d.name == "tmp"
        # The row containing `missing` is dropped.
        @test length(d.y) == 3
        @test size(d.X, 1) == 3
        @test d.y == [10.0, 20.0, 40.0]
        # Features: numeric `a` plus one indicator for `b` (first level dropped).
        @test size(d.X) == (3, 2)
    finally
        rm(tmp; force=true)  # clean up the temp file even if a test fails
    end
end

# Same load path, but selecting the target by integer column index.
@testset "load_csv_dataset with target by index" begin
    tmp = tempname() * ".csv"

    df = DataFrame(
        a = [1.0, 2.0, missing, 4.0],
        b = ["x", "y", "y", "x"],
        y = [10.0, 20.0, 30.0, 40.0]
    )

    CSV.write(tmp, df)
    try
        d = load_csv_dataset(tmp; target_col=3, cols_to_encode=[:b], name="tmp2")

        @test d.name == "tmp2"
        @test d.y == [10.0, 20.0, 40.0]
        @test size(d.X, 1) == 3
        @test size(d.X) == (3, 2)
    finally
        rm(tmp; force=true)  # clean up the temp file even if a test fails
    end
end
14 changes: 13 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
using RidgeRegression
using Test
using DataFrames
using CSV

# Top-level suite: each included file contributes a nested testset so failures
# are reported per area (dataset type, encoding, CSV loading).
@testset "RidgeRegression.jl" begin
    @testset "Dataset tests" begin
        include("dataset_tests.jl")
    end

    @testset "Encoding tests" begin
        include("encoding_tests.jl")
    end

    @testset "Load CSV dataset tests" begin
        include("load_csv_dataset_tests.jl")
    end
end
Loading