Get started

library(mintyr)

split_cv

# Prepare example data: Convert first 3 columns of iris dataset to long format and split
dt_split <- w2l_split(data = iris, cols2l = 1:3)
# dt_split is now a list containing 3 data tables for Sepal.Length, Sepal.Width, and Petal.Length

# Example 1: Single cross-validation (no repeats)
split_cv(
  split_dt = dt_split,  # Input list of split data
  v = 3,                # Set 3-fold cross-validation
  repeats = 1           # Perform cross-validation once (no repeats)
)
#> $Sepal.Length
#>                         splits     id               train           validate
#>                         <list> <char>              <list>             <list>
#> 1: <vfold_split[100x50x150x3]>  Fold1 <data.table[100x3]> <data.table[50x3]>
#> 2: <vfold_split[100x50x150x3]>  Fold2 <data.table[100x3]> <data.table[50x3]>
#> 3: <vfold_split[100x50x150x3]>  Fold3 <data.table[100x3]> <data.table[50x3]>
#> 
#> $Sepal.Width
#>                         splits     id               train           validate
#>                         <list> <char>              <list>             <list>
#> 1: <vfold_split[100x50x150x3]>  Fold1 <data.table[100x3]> <data.table[50x3]>
#> 2: <vfold_split[100x50x150x3]>  Fold2 <data.table[100x3]> <data.table[50x3]>
#> 3: <vfold_split[100x50x150x3]>  Fold3 <data.table[100x3]> <data.table[50x3]>
#> 
#> $Petal.Length
#>                         splits     id               train           validate
#>                         <list> <char>              <list>             <list>
#> 1: <vfold_split[100x50x150x3]>  Fold1 <data.table[100x3]> <data.table[50x3]>
#> 2: <vfold_split[100x50x150x3]>  Fold2 <data.table[100x3]> <data.table[50x3]>
#> 3: <vfold_split[100x50x150x3]>  Fold3 <data.table[100x3]> <data.table[50x3]>
# Returns a list where each element contains:
# - splits: rsample split objects
# - id: fold numbers (Fold1, Fold2, Fold3)
# - train: training set data
# - validate: validation set data

# Example 2: Repeated cross-validation
split_cv(
  split_dt = dt_split,  # Input list of split data
  v = 3,                # Set 3-fold cross-validation
  repeats = 2           # Perform cross-validation twice
)
#> $Sepal.Length
#>                         splits      id    id2               train
#>                         <list>  <char> <char>              <list>
#> 1: <vfold_split[100x50x150x3]> Repeat1  Fold1 <data.table[100x3]>
#> 2: <vfold_split[100x50x150x3]> Repeat1  Fold2 <data.table[100x3]>
#> 3: <vfold_split[100x50x150x3]> Repeat1  Fold3 <data.table[100x3]>
#> 4: <vfold_split[100x50x150x3]> Repeat2  Fold1 <data.table[100x3]>
#> 5: <vfold_split[100x50x150x3]> Repeat2  Fold2 <data.table[100x3]>
#> 6: <vfold_split[100x50x150x3]> Repeat2  Fold3 <data.table[100x3]>
#>              validate
#>                <list>
#> 1: <data.table[50x3]>
#> 2: <data.table[50x3]>
#> 3: <data.table[50x3]>
#> 4: <data.table[50x3]>
#> 5: <data.table[50x3]>
#> 6: <data.table[50x3]>
#> 
#> $Sepal.Width
#>                         splits      id    id2               train
#>                         <list>  <char> <char>              <list>
#> 1: <vfold_split[100x50x150x3]> Repeat1  Fold1 <data.table[100x3]>
#> 2: <vfold_split[100x50x150x3]> Repeat1  Fold2 <data.table[100x3]>
#> 3: <vfold_split[100x50x150x3]> Repeat1  Fold3 <data.table[100x3]>
#> 4: <vfold_split[100x50x150x3]> Repeat2  Fold1 <data.table[100x3]>
#> 5: <vfold_split[100x50x150x3]> Repeat2  Fold2 <data.table[100x3]>
#> 6: <vfold_split[100x50x150x3]> Repeat2  Fold3 <data.table[100x3]>
#>              validate
#>                <list>
#> 1: <data.table[50x3]>
#> 2: <data.table[50x3]>
#> 3: <data.table[50x3]>
#> 4: <data.table[50x3]>
#> 5: <data.table[50x3]>
#> 6: <data.table[50x3]>
#> 
#> $Petal.Length
#>                         splits      id    id2               train
#>                         <list>  <char> <char>              <list>
#> 1: <vfold_split[100x50x150x3]> Repeat1  Fold1 <data.table[100x3]>
#> 2: <vfold_split[100x50x150x3]> Repeat1  Fold2 <data.table[100x3]>
#> 3: <vfold_split[100x50x150x3]> Repeat1  Fold3 <data.table[100x3]>
#> 4: <vfold_split[100x50x150x3]> Repeat2  Fold1 <data.table[100x3]>
#> 5: <vfold_split[100x50x150x3]> Repeat2  Fold2 <data.table[100x3]>
#> 6: <vfold_split[100x50x150x3]> Repeat2  Fold3 <data.table[100x3]>
#>              validate
#>                <list>
#> 1: <data.table[50x3]>
#> 2: <data.table[50x3]>
#> 3: <data.table[50x3]>
#> 4: <data.table[50x3]>
#> 5: <data.table[50x3]>
#> 6: <data.table[50x3]>
# Returns a list where each element contains:
# - splits: rsample split objects
# - id: repeat numbers (Repeat1, Repeat2)
# - id2: fold numbers (Fold1, Fold2, Fold3)
# - train: training set data
# - validate: validation set data

c2p_nest

# Example data preparation: Define column names for combination
col_names <- c("Sepal.Length", "Sepal.Width", "Petal.Length")

# Example 1: Basic column-to-pairs nesting with custom separator
c2p_nest(
  iris,                   # Input iris dataset
  cols2bind = col_names,  # Columns to be combined as pairs
  pairs_n = 2,            # Create pairs of 2 columns
  sep = "&"               # Custom separator for pair names
)
#>                        pairs                data
#>                       <char>              <list>
#> 1:  Sepal.Length&Sepal.Width <data.table[150x4]>
#> 2: Sepal.Length&Petal.Length <data.table[150x4]>
#> 3:  Sepal.Width&Petal.Length <data.table[150x4]>
# Returns a nested data.table where:
# - pairs: combined column names (e.g., "Sepal.Length&Sepal.Width")
# - data: list column containing data.tables with value1, value2 columns

# Example 2: Column-to-pairs nesting with numeric indices and grouping
c2p_nest(
  iris,                   # Input iris dataset
  cols2bind = 1:3,        # First 3 columns to be combined
  pairs_n = 2,            # Create pairs of 2 columns
  by = 5                  # Group by 5th column (Species)
)
#>                        pairs    Species               data
#>                       <char>     <fctr>             <list>
#> 1:  Sepal.Length-Sepal.Width     setosa <data.table[50x3]>
#> 2:  Sepal.Length-Sepal.Width versicolor <data.table[50x3]>
#> 3:  Sepal.Length-Sepal.Width  virginica <data.table[50x3]>
#> 4: Sepal.Length-Petal.Length     setosa <data.table[50x3]>
#> 5: Sepal.Length-Petal.Length versicolor <data.table[50x3]>
#> 6: Sepal.Length-Petal.Length  virginica <data.table[50x3]>
#> 7:  Sepal.Width-Petal.Length     setosa <data.table[50x3]>
#> 8:  Sepal.Width-Petal.Length versicolor <data.table[50x3]>
#> 9:  Sepal.Width-Petal.Length  virginica <data.table[50x3]>
# Returns a nested data.table where:
# - pairs: combined column names
# - Species: grouping variable
# - data: list column containing data.tables grouped by Species

r2p_nest

# Example 1: Row-to-pairs nesting with column names
r2p_nest(
  mtcars,                     # Input mtcars dataset
  rows2bind = "cyl",          # Column to be used as row values
  by = c("hp", "drat", "wt")  # Columns to be transformed into pairs
)
#>      name                data
#>    <fctr>              <list>
#> 1:     hp <data.table[32x12]>
#> 2:   drat <data.table[32x12]>
#> 3:     wt <data.table[32x12]>
# Returns a nested data.table where:
# - name: variable names (hp, drat, wt)
# - data: list column containing data.tables with rows grouped by cyl values

# Example 2: Row-to-pairs nesting with numeric indices
r2p_nest(
  mtcars,                     # Input mtcars dataset
  rows2bind = 2,              # Use 2nd column (cyl) as row values
  by = 4:6                    # Use columns 4-6 (hp, drat, wt) for pairs
)
#>      name                data
#>    <fctr>              <list>
#> 1:     hp <data.table[32x12]>
#> 2:   drat <data.table[32x12]>
#> 3:     wt <data.table[32x12]>
# Returns a nested data.table where:
# - name: variable names from columns 4-6
# - data: list column containing data.tables with rows grouped by cyl values

export_nest

# Example 1: Basic nested data export workflow
# Step 1: Create nested data structure
dt_nest <- w2l_nest(
  data = iris,              # Input iris dataset
  cols2l = 1:2,             # Columns to be nested
  by = "Species"            # Grouping variable
)

# Step 2: Export nested data to files
export_nest(
  nest_dt = dt_nest,        # Input nested data.table
  nest_col = "data",        # Column containing nested data
  group_cols = c("name", "Species")  # Columns to create directory structure
)
#> [1] 6
# Returns the number of files created
# Creates directory structure: tempdir()/name/Species/data.txt

# Check exported files
list.files(
  path = tempdir(),         # Default export directory
  pattern = "txt",          # File type pattern to search
  recursive = TRUE          # Search in subdirectories
)
#> [1] "Sepal.Length/setosa/data.txt"     "Sepal.Length/versicolor/data.txt"
#> [3] "Sepal.Length/virginica/data.txt"  "Sepal.Width/setosa/data.txt"     
#> [5] "Sepal.Width/versicolor/data.txt"  "Sepal.Width/virginica/data.txt"
# Returns list of created files and their paths

# Clean up exported files
# `pattern` is a regular expression, so it is anchored to the ".txt" extension;
# a bare "txt" would also match unrelated files with "txt" anywhere in the name.
files <- list.files(
  path = tempdir(),         # Default export directory
  pattern = "\\.txt$",      # Match only files ending in .txt
  recursive = TRUE,         # Search in subdirectories
  full.names = TRUE         # Return full file paths
)
file.remove(files)          # Remove all exported files
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE

export_list

# Example: Export split data to files

# Step 1: Create split data structure
dt_split <- w2l_split(
  data = iris,              # Input iris dataset
  cols2l = 1:2,             # Columns to be split
  by = "Species"            # Grouping variable
)

# Step 2: Export split data to files
export_list(
  split_dt = dt_split       # Input list of data.tables
)
#> [1] 6
# Returns the number of files created
# Files are saved in tempdir() with .txt extension

# Check exported files
list.files(
  path = tempdir(),         # Default export directory
  pattern = "txt",          # File type pattern to search
  recursive = TRUE          # Search in subdirectories
)
#> [1] "Sepal.Length_setosa.txt"     "Sepal.Length_versicolor.txt"
#> [3] "Sepal.Length_virginica.txt"  "Sepal.Width_setosa.txt"     
#> [5] "Sepal.Width_versicolor.txt"  "Sepal.Width_virginica.txt"

# Clean up exported files
# `pattern` is a regular expression, so it is anchored to the ".txt" extension;
# a bare "txt" would also match unrelated files with "txt" anywhere in the name.
files <- list.files(
  path = tempdir(),         # Default export directory
  pattern = "\\.txt$",      # Match only files ending in .txt
  recursive = TRUE,         # Search in subdirectories
  full.names = TRUE         # Return full file paths
)
file.remove(files)          # Remove all exported files
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE

fires

head(fires())
#>    Location   Tag       Date    Entry     Exit Ent Wt Ext Wt Consumed Weight
#>       <int> <int>     <char>   <char>   <char>  <num>  <num>    <num>  <num>
#> 1:      101 35877 2024-10-16 14:15:39 14:18:02  0.678  0.632    0.046   67.6
#> 2:      101 35873 2024-10-16 14:18:03 14:23:05  0.632  0.384    0.248   60.8
#> 3:      101 35878 2024-10-16 14:23:15 14:28:45  0.670  0.469    0.201   70.8
#> 4:      101 35855 2024-10-16 14:29:05 14:34:29  0.755  0.634    0.121   51.2
#> 5:      101 35877 2024-10-16 14:34:30 14:34:37  0.634  0.634    0.000    0.0
#> 6:      101 35853 2024-10-16 14:34:38 14:36:26  0.634  0.634    0.000   88.6
#>    Topup Amount
#>           <num>
#> 1:        0.286
#> 2:        0.000
#> 3:        0.286
#> 4:        0.286
#> 5:        0.000
#> 6:        0.000

nedaps

head(nedaps())
#>    animal_number lifenumber responder location          visit_time duration
#>            <int>     <lgcl>     <int>    <int>              <POSc>    <int>
#> 1:      10115497         NA     15497      101 2024-09-15 20:22:51        3
#> 2:      10115967         NA     15967      101 2024-09-15 20:22:54       65
#> 3:      10115983         NA     15983      101 2024-09-15 20:23:59        2
#> 4:      10115967         NA     15967      101 2024-09-15 20:24:01       11
#> 5:      10115983         NA     15983      101 2024-09-15 20:24:12        2
#> 6:      10115967         NA     15967      101 2024-09-15 20:24:14       33
#>    state weight feed_intake
#>    <int>  <int>       <int>
#> 1:     0  46500           0
#> 2:     0  22000          17
#> 3:     0  33000           0
#> 4:     0  33500           0
#> 5:     0  35500           0
#> 6:     0  31000           0

convert_nest

# Example 1: Create nested data structures
# Create single nested column
df_nest1 <- iris |> 
  dplyr::group_nest(Species)     # Group and nest by Species

# Build a nested frame with two list columns: `data` plus a derived `data2`
df_nest2 <- dplyr::mutate(
  dplyr::group_nest(iris, Species),  # One row per Species, rows nested in `data`
  # `data2` holds each nested table with a constant column `c = 2` added
  data2 = purrr::map(data, \(tbl) dplyr::mutate(tbl, c = 2))
)

# Example 2: Convert nested structures
# Convert data frame to data table
convert_nest(
  df_nest1,                      # Input nested data frame
  to = "dt"                      # Convert to data.table
)
#>       Species               data
#>        <fctr>             <list>
#> 1:     setosa <data.table[50x4]>
#> 2: versicolor <data.table[50x4]>
#> 3:  virginica <data.table[50x4]>

# Convert specific nested columns
convert_nest(
  df_nest2,                      # Input nested data frame
  to = "dt",                     # Convert to data.table
  nest_cols = "data"             # Only convert 'data' column
)
#> Warning: `...` must be empty in `format.tbl()`
#> `...` must be empty in `format.tbl()`
#> `...` must be empty in `format.tbl()`
#> Caused by error in `format_tbl()`:
#> ! `...` must be empty.
#> ✖ Problematic arguments:
#> • na.encode = FALSE
#> • timezone = timezone
#> • justify = justify
#>       Species               data          data2
#>        <fctr>             <list>         <list>
#> 1:     setosa <data.table[50x4]> <tbl_df[50x5]>
#> 2: versicolor <data.table[50x4]> <tbl_df[50x5]>
#> 3:  virginica <data.table[50x4]> <tbl_df[50x5]>

# Example 3: Convert data table to data frame
dt_nest <- mintyr::w2l_nest(
  data = iris,                   # Input dataset
  cols2l = 1:2                   # Columns to nest
)
convert_nest(
  dt_nest,                       # Input nested data table
  to = "df"                      # Convert to data frame
)
#> # A tibble: 2 × 2
#>   name         data              
#>   <fct>        <list>            
#> 1 Sepal.Length <tibble [150 × 4]>
#> 2 Sepal.Width  <tibble [150 × 4]>

get_path_segment

# Example: Path segment extraction demonstrations

# Setup test paths
paths <- c(
  "C:/home/user/documents",   # Windows style path
  "/var/log/system",          # Unix system path
  "/usr/local/bin"            # Unix binary path
)

# Example 1: Extract first segment
get_path_segment(
  paths,                      # Input paths
  1                           # Get first segment
)
#> [1] "home" "var"  "usr"
# Returns: c("home", "var", "usr")

# Example 2: Extract second-to-last segment
get_path_segment(
  paths,                      # Input paths
  -2                          # Get second-to-last segment
)
#> [1] "user"  "log"   "local"
# Returns: c("user", "log", "local")

# Example 3: Extract from first to last segment
get_path_segment(
  paths,                      # Input paths
  c(1,-1)                     # Range from first to last
)
#> [1] "home/user/documents" "var/log/system"      "usr/local/bin"
# Returns full paths without drive letters

# Example 4: Extract first three segments
get_path_segment(
  paths,                      # Input paths
  c(1,3)                      # Range from first to third
)
#> [1] "home/user/documents" "var/log/system"      "usr/local/bin"
# Returns: c("home/user/documents", "var/log/system", "usr/local/bin")

# Example 5: Extract last two segments (range given in reverse order)
get_path_segment(
  paths,                      # Input paths
  c(-1,-2)                    # Range from last to second-to-last
)
#> [1] "user/documents" "log/system"     "local/bin"
# Returns: c("user/documents", "log/system", "local/bin")
# Note: as the output shows, segments keep their original path order

# Example 6: Extract first two segments
get_path_segment(
  paths,                      # Input paths
  c(1,2)                      # Range from first to second
)
#> [1] "home/user" "var/log"   "usr/local"
# Returns: c("home/user", "var/log", "usr/local")

format_digits

# Example: Number formatting demonstrations

# Setup test data
dt <- data.table::data.table(
  a = c(0.1234, 0.5678),      # Numeric column 1
  b = c(0.2345, 0.6789),      # Numeric column 2
  c = c("text1", "text2")     # Text column
)

# Example 1: Format all numeric columns
format_digits(
  dt,                         # Input data table
  digits = 2                  # Round to 2 decimal places
)
#>         a      b      c
#>    <char> <char> <char>
#> 1:   0.12   0.23  text1
#> 2:   0.57   0.68  text2

# Example 2: Format specific column as percentage
format_digits(
  dt,                         # Input data table
  cols = c("a"),              # Only format column 'a'
  digits = 2,                 # Round to 2 decimal places
  percentage = TRUE           # Convert to percentage
)
#>         a      b      c
#>    <char>  <num> <char>
#> 1: 12.34% 0.2345  text1
#> 2: 56.78% 0.6789  text2

mintyr_example

# Get path to an example file
mintyr_example("csv_test1.csv")
#> [1] "/tmp/RtmpLmavH5/Rinst13cb4c827134/mintyr/extdata/csv_test1.csv"

mintyr_examples

# List all example files
mintyr_examples()
#> [1] "csv_test1.csv"   "csv_test2.csv"   "xlsx_test1.xlsx" "xlsx_test2.xlsx"

import_xlsx

# Example: Excel file import demonstrations

# Setup test files
xlsx_files <- mintyr_example(
  mintyr_examples("xlsx_test")    # Get example Excel files
)

# Example 1: Import and combine all sheets from all files
import_xlsx(
  xlsx_files,                     # Input Excel file paths
  rbind = TRUE                    # Combine all sheets into one data.table
)
#>     excel_name sheet_name  col1   col2   col3
#>         <char>     <char> <num> <char> <lgcl>
#>  1: xlsx_test1     Sheet1     4      d  FALSE
#>  2: xlsx_test1     Sheet1     5      f   TRUE
#>  3: xlsx_test1     Sheet1     6      e   TRUE
#>  4: xlsx_test1     Sheet2     1      a   TRUE
#>  5: xlsx_test1     Sheet2     2      b  FALSE
#>  6: xlsx_test1     Sheet2     3      c   TRUE
#>  7: xlsx_test2     Sheet1    15      o  FALSE
#>  8: xlsx_test2     Sheet1    16      p   TRUE
#>  9: xlsx_test2     Sheet1    17      q  FALSE
#> 10: xlsx_test2          a     7      g  FALSE
#> 11: xlsx_test2          a     9      h   TRUE
#> 12: xlsx_test2          a     8      i  FALSE
#> 13: xlsx_test2          b    10      J  FALSE
#> 14: xlsx_test2          b    11      K   TRUE
#> 15: xlsx_test2          b    12      L  FALSE

# Example 2: Import specific sheets separately
import_xlsx(
  xlsx_files,                     # Input Excel file paths
  rbind = FALSE,                  # Keep sheets as separate data.tables
  sheet = 2                       # Only import the second sheet of each file
)
#> $xlsx_test1_Sheet2
#>     col1   col2   col3
#>    <num> <char> <lgcl>
#> 1:     1      a   TRUE
#> 2:     2      b  FALSE
#> 3:     3      c   TRUE
#> 
#> $xlsx_test2_a
#>     col1   col2   col3
#>    <num> <char> <lgcl>
#> 1:     7      g  FALSE
#> 2:     9      h   TRUE
#> 3:     8      i  FALSE

import_csv

# Example: CSV file import demonstrations

# Setup test files
csv_files <- mintyr_example(
  mintyr_examples("csv_test")     # Get example CSV files
)

# Example 1: Import and combine CSV files using data.table
import_csv(
  csv_files,                      # Input CSV file paths
  package = "data.table",         # Use data.table for reading
  rbind = TRUE,                   # Combine all files into one data.table
  rbind_label = "_file"           # Column name for file source
)
#>        _file  col1   col2   col3
#>       <char> <int> <char> <lgcl>
#> 1: csv_test1     4      d  FALSE
#> 2: csv_test1     5      f   TRUE
#> 3: csv_test1     6      e   TRUE
#> 4: csv_test2    15      o  FALSE
#> 5: csv_test2    16      p   TRUE
#> 6: csv_test2    17      q  FALSE

# Example 2: Import files separately using arrow
import_csv(
  csv_files,                      # Input CSV file paths
  package = "arrow",              # Use arrow for reading
  rbind = FALSE                   # Keep files separate (printed as tibbles)
)
#> $csv_test1
#> # A tibble: 3 × 3
#>    col1 col2  col3 
#>   <int> <chr> <lgl>
#> 1     4 d     FALSE
#> 2     5 f     TRUE 
#> 3     6 e     TRUE 
#> 
#> $csv_test2
#> # A tibble: 3 × 3
#>    col1 col2  col3 
#>   <int> <chr> <lgl>
#> 1    15 o     FALSE
#> 2    16 p     TRUE 
#> 3    17 q     FALSE

get_filename

# Example: File path processing demonstrations

# Setup test files
xlsx_files <- mintyr_example(
  mintyr_examples("xlsx_test")    # Get example Excel files
)

# Example 1: Extract filenames without extensions
get_filename(
  xlsx_files,                     # Input file paths
  rm_extension = TRUE,            # Remove file extensions
  rm_path = TRUE                  # Remove directory paths
)
#> [1] "xlsx_test1" "xlsx_test2"

# Example 2: Keep file extensions
get_filename(
  xlsx_files,                     # Input file paths
  rm_extension = FALSE,           # Keep file extensions
  rm_path = TRUE                  # Remove directory paths
)
#> [1] "xlsx_test1.xlsx" "xlsx_test2.xlsx"

# Example 3: Keep full paths without extensions
get_filename(
  xlsx_files,                     # Input file paths
  rm_extension = TRUE,            # Remove file extensions
  rm_path = FALSE                 # Keep directory paths
)
#> [1] "/tmp/RtmpLmavH5/Rinst13cb4c827134/mintyr/extdata/xlsx_test1"
#> [2] "/tmp/RtmpLmavH5/Rinst13cb4c827134/mintyr/extdata/xlsx_test2"

w2l_nest

# Example: Wide to long format nesting demonstrations

# Example 1: Basic nesting by group
w2l_nest(
  data = iris,                    # Input dataset
  by = "Species"                  # Group by Species column
)
#>       Species               data
#>        <fctr>             <list>
#> 1:     setosa <data.table[50x4]>
#> 2: versicolor <data.table[50x4]>
#> 3:  virginica <data.table[50x4]>

# Example 2: Nest specific columns with numeric indices
w2l_nest(
  data = iris,                    # Input dataset
  cols2l = 1:4,                   # Select first 4 columns to nest
  by = "Species"                  # Group by Species column
)
#>             name    Species               data
#>           <fctr>     <fctr>             <list>
#>  1: Sepal.Length     setosa <data.table[50x1]>
#>  2: Sepal.Length versicolor <data.table[50x1]>
#>  3: Sepal.Length  virginica <data.table[50x1]>
#>  4:  Sepal.Width     setosa <data.table[50x1]>
#>  5:  Sepal.Width versicolor <data.table[50x1]>
#>  6:  Sepal.Width  virginica <data.table[50x1]>
#>  7: Petal.Length     setosa <data.table[50x1]>
#>  8: Petal.Length versicolor <data.table[50x1]>
#>  9: Petal.Length  virginica <data.table[50x1]>
#> 10:  Petal.Width     setosa <data.table[50x1]>
#> 11:  Petal.Width versicolor <data.table[50x1]>
#> 12:  Petal.Width  virginica <data.table[50x1]>

# Example 3: Nest specific columns with column names
w2l_nest(
  data = iris,                    # Input dataset
  cols2l = c("Sepal.Length",      # Select columns by name
             "Sepal.Width", 
             "Petal.Length"),
  by = 5                          # Group by column index 5 (Species)
)
#>            name    Species               data
#>          <fctr>     <fctr>             <list>
#> 1: Sepal.Length     setosa <data.table[50x2]>
#> 2: Sepal.Length versicolor <data.table[50x2]>
#> 3: Sepal.Length  virginica <data.table[50x2]>
#> 4:  Sepal.Width     setosa <data.table[50x2]>
#> 5:  Sepal.Width versicolor <data.table[50x2]>
#> 6:  Sepal.Width  virginica <data.table[50x2]>
#> 7: Petal.Length     setosa <data.table[50x2]>
#> 8: Petal.Length versicolor <data.table[50x2]>
#> 9: Petal.Length  virginica <data.table[50x2]>
# Returns similar structure to Example 2

w2l_split

# Example: Wide to long format splitting demonstrations

# Example 1: Basic splitting by Species
w2l_split(
  data = iris,                    # Input dataset
  by = "Species"                  # Split by Species column
) |> 
  lapply(head)                    # Show first 6 rows of each split
#> $setosa
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width
#>           <num>       <num>        <num>       <num>
#> 1:          5.1         3.5          1.4         0.2
#> 2:          4.9         3.0          1.4         0.2
#> 3:          4.7         3.2          1.3         0.2
#> 4:          4.6         3.1          1.5         0.2
#> 5:          5.0         3.6          1.4         0.2
#> 6:          5.4         3.9          1.7         0.4
#> 
#> $versicolor
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width
#>           <num>       <num>        <num>       <num>
#> 1:          7.0         3.2          4.7         1.4
#> 2:          6.4         3.2          4.5         1.5
#> 3:          6.9         3.1          4.9         1.5
#> 4:          5.5         2.3          4.0         1.3
#> 5:          6.5         2.8          4.6         1.5
#> 6:          5.7         2.8          4.5         1.3
#> 
#> $virginica
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width
#>           <num>       <num>        <num>       <num>
#> 1:          6.3         3.3          6.0         2.5
#> 2:          5.8         2.7          5.1         1.9
#> 3:          7.1         3.0          5.9         2.1
#> 4:          6.3         2.9          5.6         1.8
#> 5:          6.5         3.0          5.8         2.2
#> 6:          7.6         3.0          6.6         2.1

# Example 2: Split specific columns using numeric indices
w2l_split(
  data = iris,                    # Input dataset
  cols2l = 1:3,                   # Select first 3 columns to split
  by = 5                          # Split by column index 5 (Species)
) |> 
  lapply(head)                    # Show first 6 rows of each split
#> $Sepal.Length_setosa
#>    Petal.Width value
#>          <num> <num>
#> 1:         0.2   5.1
#> 2:         0.2   4.9
#> 3:         0.2   4.7
#> 4:         0.2   4.6
#> 5:         0.2   5.0
#> 6:         0.4   5.4
#> 
#> $Sepal.Length_versicolor
#>    Petal.Width value
#>          <num> <num>
#> 1:         1.4   7.0
#> 2:         1.5   6.4
#> 3:         1.5   6.9
#> 4:         1.3   5.5
#> 5:         1.5   6.5
#> 6:         1.3   5.7
#> 
#> $Sepal.Length_virginica
#>    Petal.Width value
#>          <num> <num>
#> 1:         2.5   6.3
#> 2:         1.9   5.8
#> 3:         2.1   7.1
#> 4:         1.8   6.3
#> 5:         2.2   6.5
#> 6:         2.1   7.6
#> 
#> $Sepal.Width_setosa
#>    Petal.Width value
#>          <num> <num>
#> 1:         0.2   3.5
#> 2:         0.2   3.0
#> 3:         0.2   3.2
#> 4:         0.2   3.1
#> 5:         0.2   3.6
#> 6:         0.4   3.9
#> 
#> $Sepal.Width_versicolor
#>    Petal.Width value
#>          <num> <num>
#> 1:         1.4   3.2
#> 2:         1.5   3.2
#> 3:         1.5   3.1
#> 4:         1.3   2.3
#> 5:         1.5   2.8
#> 6:         1.3   2.8
#> 
#> $Sepal.Width_virginica
#>    Petal.Width value
#>          <num> <num>
#> 1:         2.5   3.3
#> 2:         1.9   2.7
#> 3:         2.1   3.0
#> 4:         1.8   2.9
#> 5:         2.2   3.0
#> 6:         2.1   3.0
#> 
#> $Petal.Length_setosa
#>    Petal.Width value
#>          <num> <num>
#> 1:         0.2   1.4
#> 2:         0.2   1.4
#> 3:         0.2   1.3
#> 4:         0.2   1.5
#> 5:         0.2   1.4
#> 6:         0.4   1.7
#> 
#> $Petal.Length_versicolor
#>    Petal.Width value
#>          <num> <num>
#> 1:         1.4   4.7
#> 2:         1.5   4.5
#> 3:         1.5   4.9
#> 4:         1.3   4.0
#> 5:         1.5   4.6
#> 6:         1.3   4.5
#> 
#> $Petal.Length_virginica
#>    Petal.Width value
#>          <num> <num>
#> 1:         2.5   6.0
#> 2:         1.9   5.1
#> 3:         2.1   5.9
#> 4:         1.8   5.6
#> 5:         2.2   5.8
#> 6:         2.1   6.6

# Example 3: Split specific columns using column names
list_res <- w2l_split(
  data = iris,                    # Input dataset
  cols2l = c("Sepal.Length",      # Select columns by name
             "Sepal.Width"),
  by = "Species"                  # Split by Species column
)
lapply(list_res, head)            # Show first 6 rows of each split
#> $Sepal.Length_setosa
#>    Petal.Length Petal.Width value
#>           <num>       <num> <num>
#> 1:          1.4         0.2   5.1
#> 2:          1.4         0.2   4.9
#> 3:          1.3         0.2   4.7
#> 4:          1.5         0.2   4.6
#> 5:          1.4         0.2   5.0
#> 6:          1.7         0.4   5.4
#> 
#> $Sepal.Length_versicolor
#>    Petal.Length Petal.Width value
#>           <num>       <num> <num>
#> 1:          4.7         1.4   7.0
#> 2:          4.5         1.5   6.4
#> 3:          4.9         1.5   6.9
#> 4:          4.0         1.3   5.5
#> 5:          4.6         1.5   6.5
#> 6:          4.5         1.3   5.7
#> 
#> $Sepal.Length_virginica
#>    Petal.Length Petal.Width value
#>           <num>       <num> <num>
#> 1:          6.0         2.5   6.3
#> 2:          5.1         1.9   5.8
#> 3:          5.9         2.1   7.1
#> 4:          5.6         1.8   6.3
#> 5:          5.8         2.2   6.5
#> 6:          6.6         2.1   7.6
#> 
#> $Sepal.Width_setosa
#>    Petal.Length Petal.Width value
#>           <num>       <num> <num>
#> 1:          1.4         0.2   3.5
#> 2:          1.4         0.2   3.0
#> 3:          1.3         0.2   3.2
#> 4:          1.5         0.2   3.1
#> 5:          1.4         0.2   3.6
#> 6:          1.7         0.4   3.9
#> 
#> $Sepal.Width_versicolor
#>    Petal.Length Petal.Width value
#>           <num>       <num> <num>
#> 1:          4.7         1.4   3.2
#> 2:          4.5         1.5   3.2
#> 3:          4.9         1.5   3.1
#> 4:          4.0         1.3   2.3
#> 5:          4.6         1.5   2.8
#> 6:          4.5         1.3   2.8
#> 
#> $Sepal.Width_virginica
#>    Petal.Length Petal.Width value
#>           <num>       <num> <num>
#> 1:          6.0         2.5   3.3
#> 2:          5.1         1.9   2.7
#> 3:          5.9         2.1   3.0
#> 4:          5.6         1.8   2.9
#> 5:          5.8         2.2   3.0
#> 6:          6.6         2.1   3.0
# Returns similar structure to Example 2

nest_cv

# Example: Cross-validation for nested data.table demonstrations

# Setup test data
dt_nest <- w2l_nest(
  data = iris,                   # Input dataset
  cols2l = 1:2                   # Nest first 2 columns
)

# Example 1: Basic 2-fold cross-validation
nest_cv(
  nest_dt = dt_nest,             # Input nested data.table
  v = 2                          # Number of folds (2-fold CV)
)
#>            name                     splits     id              train
#>          <fctr>                     <list> <char>             <list>
#> 1: Sepal.Length <vfold_split[75x75x150x4]>  Fold1 <data.table[75x4]>
#> 2: Sepal.Length <vfold_split[75x75x150x4]>  Fold2 <data.table[75x4]>
#> 3:  Sepal.Width <vfold_split[75x75x150x4]>  Fold1 <data.table[75x4]>
#> 4:  Sepal.Width <vfold_split[75x75x150x4]>  Fold2 <data.table[75x4]>
#>              validate
#>                <list>
#> 1: <data.table[75x4]>
#> 2: <data.table[75x4]>
#> 3: <data.table[75x4]>
#> 4: <data.table[75x4]>

# Example 2: Repeated 2-fold cross-validation
nest_cv(
  nest_dt = dt_nest,             # Input nested data.table
  v = 2,                         # Number of folds (2-fold CV)
  repeats = 2                    # Number of repetitions
)
#>            name                     splits      id    id2              train
#>          <fctr>                     <list>  <char> <char>             <list>
#> 1: Sepal.Length <vfold_split[75x75x150x4]> Repeat1  Fold1 <data.table[75x4]>
#> 2: Sepal.Length <vfold_split[75x75x150x4]> Repeat1  Fold2 <data.table[75x4]>
#> 3: Sepal.Length <vfold_split[75x75x150x4]> Repeat2  Fold1 <data.table[75x4]>
#> 4: Sepal.Length <vfold_split[75x75x150x4]> Repeat2  Fold2 <data.table[75x4]>
#> 5:  Sepal.Width <vfold_split[75x75x150x4]> Repeat1  Fold1 <data.table[75x4]>
#> 6:  Sepal.Width <vfold_split[75x75x150x4]> Repeat1  Fold2 <data.table[75x4]>
#> 7:  Sepal.Width <vfold_split[75x75x150x4]> Repeat2  Fold1 <data.table[75x4]>
#> 8:  Sepal.Width <vfold_split[75x75x150x4]> Repeat2  Fold2 <data.table[75x4]>
#>              validate
#>                <list>
#> 1: <data.table[75x4]>
#> 2: <data.table[75x4]>
#> 3: <data.table[75x4]>
#> 4: <data.table[75x4]>
#> 5: <data.table[75x4]>
#> 6: <data.table[75x4]>
#> 7: <data.table[75x4]>
#> 8: <data.table[75x4]>

top_perc

# Example 1: Basic usage with single trait
# Select the top 10% of observations by Petal.Width. With keep_data = TRUE
# the result holds both the summary statistics ($stat) and the selected
# rows ($data).
iris |>
  top_perc(
    perc = 0.1,
    trait = c("Petal.Width"),
    keep_data = TRUE
  )
#> $Petal.Width_0.1
#> $Petal.Width_0.1$stat
#> # A tibble: 1 × 5
#>   variable        n  mean    sd top_perc
#>   <fct>       <dbl> <dbl> <dbl> <chr>   
#> 1 Petal.Width    17  2.34   0.1 10%     
#> 
#> $Petal.Width_0.1$data
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
#> 1           6.3         3.3          6.0         2.5 virginica
#> 2           6.5         3.0          5.8         2.2 virginica
#> 3           7.2         3.6          6.1         2.5 virginica
#> 4           5.8         2.8          5.1         2.4 virginica
#> 5           6.4         3.2          5.3         2.3 virginica
#> 6           7.7         3.8          6.7         2.2 virginica
#> 7           7.7         2.6          6.9         2.3 virginica
#> 8           6.9         3.2          5.7         2.3 virginica
#> 9           6.4         2.8          5.6         2.2 virginica
#> 10          7.7         3.0          6.1         2.3 virginica
#> 11          6.3         3.4          5.6         2.4 virginica
#> 12          6.7         3.1          5.6         2.4 virginica
#> 13          6.9         3.1          5.1         2.3 virginica
#> 14          6.8         3.2          5.9         2.3 virginica
#> 15          6.7         3.3          5.7         2.5 virginica
#> 16          6.7         3.0          5.2         2.3 virginica
#> 17          6.2         3.4          5.4         2.3 virginica

# Example 2: Using grouping with 'by' parameter
# Same top-10% selection, computed separately within each Species.
# keep_data is not set here, and the printed result is a single tibble of
# summary statistics with one row per group (no filtered data).
iris |>
  top_perc(
    perc = 0.1,
    trait = c("Petal.Width"),
    by = "Species"
  )
#> # A tibble: 3 × 6
#>   Species    variable        n  mean    sd top_perc
#>   <fct>      <fct>       <dbl> <dbl> <dbl> <chr>   
#> 1 setosa     Petal.Width     9 0.433 0.071 10%     
#> 2 versicolor Petal.Width     5 1.66  0.089 10%     
#> 3 virginica  Petal.Width     6 2.45  0.055 10%

# Example 3: Complex example with multiple percentages and grouping variables
# Reshape data from wide to long format for Sepal.Length and Sepal.Width,
# then summarise each Species/names combination once per percentage.
# perc = 0.1 yields the "10%" rows of the output; perc = -0.2 yields the
# "-20%" rows, whose lower means show that a negative value selects from the
# bottom of the distribution instead of the top.
# NOTE(review): type = "mean_sd" presumably chooses the n/mean/sd summary
# printed below -- confirm against the top_perc reference.
iris |> 
  tidyr::pivot_longer(1:2,
                      names_to = "names", 
                      values_to = "values") |> 
  mintyr::top_perc(
    perc = c(0.1, -0.2),
    trait = "values",
    by = c("Species", "names"),
    type = "mean_sd")
#> # A tibble: 12 × 7
#>    Species    names        variable     n  mean    sd top_perc
#>    <fct>      <chr>        <fct>    <dbl> <dbl> <dbl> <chr>   
#>  1 setosa     Sepal.Length values       5  5.64 0.134 10%     
#>  2 setosa     Sepal.Width  values       6  4.08 0.194 10%     
#>  3 versicolor Sepal.Length values       6  6.8  0.126 10%     
#>  4 versicolor Sepal.Width  values       5  3.26 0.089 10%     
#>  5 virginica  Sepal.Length values       5  7.74 0.089 10%     
#>  6 virginica  Sepal.Width  values       5  3.6  0.2   10%     
#>  7 setosa     Sepal.Length values      11  4.53 0.135 -20%    
#>  8 setosa     Sepal.Width  values      12  2.97 0.219 -20%    
#>  9 versicolor Sepal.Length values      11  5.28 0.244 -20%    
#> 10 versicolor Sepal.Width  values      13  2.35 0.151 -20%    
#> 11 virginica  Sepal.Length values      11  5.79 0.336 -20%    
#> 12 virginica  Sepal.Width  values      11  2.56 0.15  -20%