Title: | Simplifies Exploratory Data Analysis |
---|---|
Description: | Interactive data exploration with one line of code, automated reporting or use an easy to remember set of tidy functions for low code exploratory data analysis. |
Authors: | Roland Krasser [aut, cre] |
Maintainer: | Roland Krasser <[email protected]> |
License: | MIT + file LICENSE |
Version: | 1.3.3 |
Built: | 2025-02-04 05:57:25 UTC |
Source: | https://github.com/rolkra/explore |
A/B testing
abtest(data, expr, n, target, sign_level = 0.05, color = "grey")
abtest(data, expr, n, target, sign_level = 0.05, color = "grey")
data |
A dataset. If no data is provided, a shiny app is launched |
expr |
Logical expression that results in FALSE/TRUE |
n |
A Variable for number of observations (count data) |
target |
Target variable |
sign_level |
Significance Level (typical 0.01/0.05/0.10) |
color |
Fill color of bar/violin-plot |
Plot that shows if difference is significant
## Using chi2-test or t-test depending on target type data <- create_data_buy(obs = 100) abtest(data, female_ind == 1, target = buy) # chi2 test abtest(data, city_ind == 1, target = age) # t test ## If small number of observations, Fisher's Exact test ## is used for a binary target (if <= 5 observations in a subgroup) data <- create_data_buy(obs = 25, seed = 1) abtest(data, female_ind == 1, target = buy) # Fisher's Exact test
## Using chi2-test or t-test depending on target type data <- create_data_buy(obs = 100) abtest(data, female_ind == 1, target = buy) # chi2 test abtest(data, city_ind == 1, target = age) # t test ## If small number of observations, Fisher's Exact test ## is used for a binary target (if <= 5 observations in a subgroup) data <- create_data_buy(obs = 25, seed = 1) abtest(data, female_ind == 1, target = buy) # Fisher's Exact test
Launches a shiny app to A/B test
abtest_shiny( size_a = 100, size_b = 100, success_a = 10, success_b = 20, success_unit = "percent", sign_level = 0.05 )
abtest_shiny( size_a = 100, size_b = 100, success_a = 10, success_b = 20, success_unit = "percent", sign_level = 0.05 )
size_a |
Size of Group A |
size_b |
Size of Group B |
success_a |
Success of Group A |
success_b |
Success of Group B |
success_unit |
"count" | "percent" |
sign_level |
Significance Level (typical 0.01/0.05/0.10) |
# Only run examples in interactive R sessions if (interactive()) { abtest_shiny() }
# Only run examples in interactive R sessions if (interactive()) { abtest_shiny() }
A/B testing comparing two means
abtest_targetnum(data, expr, target, sign_level = 0.05, color = "grey")
abtest_targetnum(data, expr, target, sign_level = 0.05, color = "grey")
data |
A dataset |
expr |
Expression, that results in a FALSE/TRUE |
target |
Target variable (must be numeric) |
sign_level |
Significance Level (typical 0.01/0.05/0.10) |
color |
fill color |
Plot that shows if difference is significant
data <- create_data_buy(obs = 100) abtest(data, city_ind == 1, target = age)
data <- create_data_buy(obs = 100) abtest(data, city_ind == 1, target = age)
A/B testing comparing percent per group
abtest_targetpct( data, expr, n, target, sign_level = 0.05, group_label, ab_label = FALSE, color = "grey" )
abtest_targetpct( data, expr, n, target, sign_level = 0.05, group_label, ab_label = FALSE, color = "grey" )
data |
A dataset |
expr |
Expression, that results in a FALSE/TRUE |
n |
A Variable for number of observations (count data) |
target |
Target variable (must be 0/1 or FALSE/TRUE) |
sign_level |
Significance Level (typical 0.01/0.05/0.10) |
group_label |
Label of groups (default = expr) |
ab_label |
Label Groups as A and B (default = FALSE) |
color |
color of bar |
Plot that shows if difference is significant
data <- create_data_buy(obs = 100) abtest(data, female_ind == 1, target = buy) abtest(data, age >= 40, target = buy)
data <- create_data_buy(obs = 100) abtest(data, female_ind == 1, target = buy) abtest(data, age >= 40, target = buy)
Add a variable id at first column in dataset
add_var_id(data, name = "id", overwrite = FALSE)
add_var_id(data, name = "id", overwrite = FALSE)
data |
A dataset |
name |
Name of new variable (as string) |
overwrite |
Can new id variable overwrite an existing variable in dataset? |
Data set containing new id variable
library(magrittr) iris %>% add_var_id() %>% head() iris %>% add_var_id(name = "iris_nr") %>% head()
library(magrittr) iris %>% add_var_id() %>% head() iris %>% add_var_id(name = "iris_nr") %>% head()
Add a random 0/1 variable to dataset
add_var_random_01( data, name = "random_01", prob = c(0.5, 0.5), overwrite = TRUE, seed )
add_var_random_01( data, name = "random_01", prob = c(0.5, 0.5), overwrite = TRUE, seed )
data |
A dataset |
name |
Name of new variable (as string) |
prob |
Vector of probabilities |
overwrite |
Can new random variable overwrite an existing variable in dataset? |
seed |
Seed for random number generation (integer) |
Dataset containing new random variable
library(magrittr) iris %>% add_var_random_01() %>% head() iris %>% add_var_random_01(name = "my_var") %>% head()
library(magrittr) iris %>% add_var_random_01() %>% head() iris %>% add_var_random_01(name = "my_var") %>% head()
Add a random categorical variable to dataset
add_var_random_cat( data, name = "random_cat", cat = LETTERS[1:6], prob, overwrite = TRUE, seed )
add_var_random_cat( data, name = "random_cat", cat = LETTERS[1:6], prob, overwrite = TRUE, seed )
data |
A dataset |
name |
Name of new variable (as string) |
cat |
Vector of categories |
prob |
Vector of probabilities |
overwrite |
Can new random variable overwrite an existing variable in dataset? |
seed |
Seed for random number generation (integer) |
Dataset containing new random variable
library(magrittr) iris %>% add_var_random_cat() %>% head() iris %>% add_var_random_cat(name = "my_cat") %>% head() iris %>% add_var_random_cat(cat = c("Version A", "Version B")) %>% head() iris %>% add_var_random_cat(cat = c(1,2,3,4,5)) %>% head()
library(magrittr) iris %>% add_var_random_cat() %>% head() iris %>% add_var_random_cat(name = "my_cat") %>% head() iris %>% add_var_random_cat(cat = c("Version A", "Version B")) %>% head() iris %>% add_var_random_cat(cat = c(1,2,3,4,5)) %>% head()
Add a random double variable to dataset
add_var_random_dbl( data, name = "random_dbl", min_val = 0, max_val = 100, overwrite = TRUE, seed )
add_var_random_dbl( data, name = "random_dbl", min_val = 0, max_val = 100, overwrite = TRUE, seed )
data |
A dataset |
name |
Name of new variable (as string) |
min_val |
Minimum value of the random doubles |
max_val |
Maximum value of the random doubles |
overwrite |
Can new random variable overwrite an existing variable in dataset? |
seed |
Seed for random number generation (integer) |
Dataset containing new random variable
library(magrittr) iris %>% add_var_random_dbl() %>% head() iris %>% add_var_random_dbl(name = "random_var") %>% head() iris %>% add_var_random_dbl(min_val = 1, max_val = 10) %>% head()
library(magrittr) iris %>% add_var_random_dbl() %>% head() iris %>% add_var_random_dbl(name = "random_var") %>% head() iris %>% add_var_random_dbl(min_val = 1, max_val = 10) %>% head()
Add a random integer variable to dataset
add_var_random_int( data, name = "random_int", min_val = 1, max_val = 10, overwrite = TRUE, seed )
add_var_random_int( data, name = "random_int", min_val = 1, max_val = 10, overwrite = TRUE, seed )
data |
A dataset |
name |
Name of new variable (as string) |
min_val |
Minimum random integers |
max_val |
Maximum random integers |
overwrite |
Can new random variable overwrite an existing variable in dataset? |
seed |
Seed for random number generation (integer) |
Dataset containing new random variable
library(magrittr) iris %>% add_var_random_int() %>% head() iris %>% add_var_random_int(name = "random_var") %>% head() iris %>% add_var_random_int(min_val = 1, max_val = 10) %>% head()
library(magrittr) iris %>% add_var_random_int() %>% head() iris %>% add_var_random_int(name = "random_var") %>% head() iris %>% add_var_random_int(min_val = 1, max_val = 10) %>% head()
Add a random moon variable to dataset
add_var_random_moon(data, name = "random_moon", overwrite = TRUE, seed)
add_var_random_moon(data, name = "random_moon", overwrite = TRUE, seed)
data |
A dataset |
name |
Name of new variable (as string) |
overwrite |
Can new random variable overwrite an existing variable in dataset? |
seed |
Seed for random number generation (integer) |
Dataset containing new random variable
library(magrittr) iris %>% add_var_random_moon() %>% head()
library(magrittr) iris %>% add_var_random_moon() %>% head()
Add a random starsign variable to dataset
add_var_random_starsign( data, name = "random_starsign", lang = "en", overwrite = TRUE, seed )
add_var_random_starsign( data, name = "random_starsign", lang = "en", overwrite = TRUE, seed )
data |
A dataset |
name |
Name of new variable (as string) |
lang |
Language used for starsign (en = English, de = Deutsch, es = Espanol) |
overwrite |
Can new random variable overwrite an existing variable in dataset? |
seed |
Seed for random number generation (integer) |
Dataset containing new random variable
library(magrittr) iris %>% add_var_random_starsign() %>% head() iris %>% add_var_random_starsign(lang = "de") %>% head()
library(magrittr) iris %>% add_var_random_starsign() %>% head() iris %>% add_var_random_starsign(lang = "de") %>% head()
Balances the target variable in your dataset using downsampling. Target must be 0/1, FALSE/TRUE or no/yes
balance_target(data, target, min_prop = 0.1, seed)
balance_target(data, target, min_prop = 0.1, seed)
data |
A dataset |
target |
Target variable (0/1, TRUE/FALSE, yes/no) |
min_prop |
Minimum proportion of one of the target categories |
seed |
Seed for random number generator |
Data
iris$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) balanced <- balance_target(iris, target = is_versicolor, min_prop = 0.5) describe(balanced, is_versicolor)
iris$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) balanced <- balance_target(iris, target = is_versicolor, min_prop = 0.5) describe(balanced, is_versicolor)
Check vector for low variance
check_vec_low_variance(values, max_prop = 0.99)
check_vec_low_variance(values, max_prop = 0.99)
values |
Vector of values |
max_prop |
Maximum proportion of values without variance |
TRUE/FALSE (low variance)
## Not run: values <- c(1, rep(0 ,1000)) check_vec_low_variance(values, max_prop = 0.9) ## End(Not run)
## Not run: values <- c(1, rep(0 ,1000)) check_vec_low_variance(values, max_prop = 0.9) ## End(Not run)
Clean variable (replace NA values, set min_val and max_val)
clean_var( data, var, na = NA, min_val = NA, max_val = NA, max_cat = NA, rescale01 = FALSE, simplify_text = FALSE, name = NA )
clean_var( data, var, na = NA, min_val = NA, max_val = NA, max_cat = NA, rescale01 = FALSE, simplify_text = FALSE, name = NA )
data |
A dataset |
var |
Name of variable |
na |
Value that replaces NA |
min_val |
All values < min_val are converted to min_val (var numeric or character) |
max_val |
All values > max_val are converted to max_val (var numeric or character) |
max_cat |
Maximum number of different factor levels for categorical variable (if more, .OTHER is added) |
rescale01 |
IF TRUE, value is rescaled between 0 and 1 (var must be numeric) |
simplify_text |
If TRUE, a character variable is simplified (trim, upper, ...) |
name |
New name of variable (as string) |
Dataset
library(magrittr) iris %>% clean_var(Sepal.Width, max_val = 3.5, name = "sepal_width") %>% head() iris %>% clean_var(Sepal.Width, rescale01 = TRUE) %>% head()
library(magrittr) iris %>% clean_var(Sepal.Width, max_val = 3.5, name = "sepal_width") %>% head() iris %>% clean_var(Sepal.Width, rescale01 = TRUE) %>% head()
Adds variables total and pct (percentage) to dplyr::count()
count_pct(data, ...)
count_pct(data, ...)
data |
A dataset |
... |
Other parameters passed to count() |
Dataset
count_pct(iris, Species)
count_pct(iris, Species)
Data that can be used for unit-testing or teaching
create_data_abtest( n_a = 100, n_b = 100, success_a = 10, success_b = 5, success_unit = "count", count = TRUE )
create_data_abtest( n_a = 100, n_b = 100, success_a = 10, success_b = 5, success_unit = "count", count = TRUE )
n_a |
Total size of group A |
n_b |
Total size of group B |
success_a |
Success in group A |
success_b |
Success in group B |
success_unit |
Unit ("count"|"percent") |
count |
Create as count-data (FALSE|TRUE) |
A dataset as tibble
library(dplyr) create_data_abtest() %>% abtest() create_data_abtest( n_a = 100, n_b = 100, success_a = 20, success_b = 30, success_unit = "count" ) %>% abtest()
library(dplyr) create_data_abtest() %>% abtest() create_data_abtest( n_a = 100, n_b = 100, success_a = 20, success_b = 30, success_unit = "count" ) %>% abtest()
Artificial data that can be used for unit-testing or teaching
create_data_app(obs = 1000, add_id = FALSE, seed = 123)
create_data_app(obs = 1000, add_id = FALSE, seed = 123)
obs |
Number of observations |
add_id |
Add an id-variable to data? |
seed |
Seed for randomization (integer) |
A dataset as tibble
create_data_app()
create_data_app()
Artificial data that can be used for unit-testing or teaching
create_data_buy( obs = 1000, target_name = "buy", factorise_target = FALSE, target1_prob = 0.5, add_extreme = TRUE, flip_gender = FALSE, add_id = FALSE, seed = 123 )
create_data_buy( obs = 1000, target_name = "buy", factorise_target = FALSE, target1_prob = 0.5, add_extreme = TRUE, flip_gender = FALSE, add_id = FALSE, seed = 123 )
obs |
Number of observations |
target_name |
Variable name of target |
factorise_target |
Should target variable be factorised? (from 0/1 to factor no/yes)? |
target1_prob |
Probability that target = 1 |
add_extreme |
Add an observation with extreme values? |
flip_gender |
Should Male/Female be flipped in data? |
add_id |
Add an id-variable to data? |
seed |
Seed for randomization |
Variables in dataset:
id = Identifier
period = Year & Month (YYYYMM)
city_ind = Indicating if customer is residing in a city (1 = yes, 0 = no)
female_ind = Gender of customer is female (1 = yes, 0 = no)
fixedvoice_ind = Customer has a fixed voice product (1 = yes, 0 = no)
fixeddata_ind = Customer has a fixed data product (1 = yes, 0 = no)
fixedtv_ind = Customer has a fixed TV product (1 = yes, 0 = no)
mobilevoice_ind = Customer has a mobile voice product (1 = yes, 0 = no)
mobiledata_prd = Customer has a mobile data product (NO/MOBILE STICK/BUSINESS)
bbi_speed_ind = Customer has a Broadband Internet (BBI) with extra speed
bbi_usg_gb = Broadband Internet (BBI) usage in Gigabyte (GB) last month
hh_single = Expected to be a Single Household (1 = yes, 0 = no)
Target in dataset:
buy (may be renamed) = Did customer buy a new product in next month? (1 = yes, 0 = no)
A dataset as tibble
create_data_buy()
create_data_buy()
Artificial data that can be used for unit-testing or teaching
create_data_churn( obs = 1000, target_name = "churn", factorise_target = FALSE, target1_prob = 0.4, add_id = FALSE, seed = 123 )
create_data_churn( obs = 1000, target_name = "churn", factorise_target = FALSE, target1_prob = 0.4, add_id = FALSE, seed = 123 )
obs |
Number of observations |
target_name |
Variable name of target |
factorise_target |
Should target variable be factorised? |
target1_prob |
Probability that target = 1 |
add_id |
Add an id-variable to data? |
seed |
Seed for randomization (integer) |
A dataset as tibble
create_data_churn()
create_data_churn()
Create an empty dataset
create_data_empty(obs = 1000, add_id = FALSE)
create_data_empty(obs = 1000, add_id = FALSE)
obs |
Number of observations |
add_id |
Add an id |
Dataset as tibble
create_data_empty(obs = 100) create_data_empty(obs = 100, add_id = TRUE)
create_data_empty(obs = 100) create_data_empty(obs = 100, add_id = TRUE)
Random data that can be used for unit-testing or teaching
create_data_esoteric(obs = 1000, add_id = FALSE, seed = 123)
create_data_esoteric(obs = 1000, add_id = FALSE, seed = 123)
obs |
Number of observations |
add_id |
Add an id-variable to data? |
seed |
Seed for randomization |
Variables in dataset:
id = Identifier
starsign = random starsign
chinese = random chinese zodiac
moon = random moon phase
blood = random blood type
fingers_crossed = random fingers crossed (1 = yes, 0 = no)
success = random success (1 = yes, 0 = no)
A dataset as tibble
create_data_esoteric(obs = 100)
create_data_esoteric(obs = 100)
Artificial data that can be used for unit-testing or teaching (fairness & AI bias)
create_data_newsletter(obs = 1000, add_id = FALSE, seed = 123)
create_data_newsletter(obs = 1000, add_id = FALSE, seed = 123)
obs |
Number of observations |
add_id |
Add an id-variable to data? |
seed |
Seed for randomization (integer) |
A dataset as tibble
create_data_newsletter()
create_data_newsletter()
Artificial data that can be used for unit-testing or teaching
create_data_person(obs = 1000, add_id = FALSE, seed = 123)
create_data_person(obs = 1000, add_id = FALSE, seed = 123)
obs |
Number of observations |
add_id |
Add an id |
seed |
Seed for randomization (integer) |
A dataset as tibble
create_data_person()
create_data_person()
Random data that can be used for unit-testing or teaching
create_data_random( obs = 1000, vars = 10, target_name = "target_ind", factorise_target = FALSE, target1_prob = 0.5, add_id = TRUE, seed = 123 )
create_data_random( obs = 1000, vars = 10, target_name = "target_ind", factorise_target = FALSE, target1_prob = 0.5, add_id = TRUE, seed = 123 )
obs |
Number of observations |
vars |
Number of variables |
target_name |
Variable name of target |
factorise_target |
Should target variable be factorised (from 0/1 to factor no/yes)? |
target1_prob |
Probability that target = 1 |
add_id |
Add an id-variable to data? |
seed |
Seed for randomization |
Variables in dataset:
id = Identifier
var_X = variable containing values between 0 and 100
Target in dataset:
target_ind (may be renamed) = random values (1 = yes, 0 = no)
A dataset as tibble
create_data_random(obs = 100, vars = 5)
create_data_random(obs = 100, vars = 5)
Artificial data that can be used for unit-testing or teaching (fairness & AI bias)
create_data_unfair( obs = 1000, target_name = "target_ind", factorise_target = FALSE, target1_prob = 0.25, add_id = FALSE, seed = 123 )
create_data_unfair( obs = 1000, target_name = "target_ind", factorise_target = FALSE, target1_prob = 0.25, add_id = FALSE, seed = 123 )
obs |
Number of observations |
target_name |
Variable name of target |
factorise_target |
Should target variable be factorised? |
target1_prob |
Probability that target = 1 |
add_id |
Add an id-variable to data? |
seed |
Seed for randomization (integer) |
A dataset as tibble
create_data_unfair()
create_data_unfair()
Generate an RMarkdown Notebook template for a report. You must provide an output directory (parameter output_dir). The default file name is "notebook-explore.Rmd" (may overwrite an existing file with the same name)
create_notebook_explore(output_file = "notebook-explore.Rmd", output_dir)
create_notebook_explore(output_file = "notebook-explore.Rmd", output_dir)
output_file |
Filename of the notebook template (Rmd file) |
output_dir |
Directory where the notebook template is saved |
create_notebook_explore(output_file = "explore.Rmd", output_dir = tempdir())
create_notebook_explore(output_file = "explore.Rmd", output_dir = tempdir())
Cut a variable
cut_vec_num_avg(values, bins = 8)
cut_vec_num_avg(values, bins = 8)
values |
Variable |
bins |
Number of bins |
Data frame
Create a data dictionary Markdown file
data_dict_md( data, title = "", description = NA, output_file = "data_dict.md", output_dir )
data_dict_md( data, title = "", description = NA, output_file = "data_dict.md", output_dir )
data |
A dataframe (data dictionary for all variables) |
title |
Title of the data dictionary |
description |
Detailed description of variables in data (dataframe with columns 'variable' and 'description') |
output_file |
Output filename for Markdown file |
output_dir |
Directory where the Markdown file is saved |
Create Markdown file
# Data dictionary of a dataframe data_dict_md(iris, title = "iris flower data set", output_dir = tempdir()) # Data dictionary of a dataframe with additional description of variables description <- data.frame( variable = c("Species"), description = c("Species of Iris flower")) data_dict_md(iris, title = "iris flower data set", description = description, output_dir = tempdir())
# Data dictionary of a dataframe data_dict_md(iris, title = "iris flower data set", output_dir = tempdir()) # Data dictionary of a dataframe with additional description of variables description <- data.frame( variable = c("Species"), description = c("Species of Iris flower")) data_dict_md(iris, title = "iris flower data set", description = description, output_dir = tempdir())
decrypt text
decrypt(text, codeletters = c(toupper(letters), letters, 0:9), shift = 18)
decrypt(text, codeletters = c(toupper(letters), letters, 0:9), shift = 18)
text |
A text (character) |
codeletters |
A string of letters that are used for decryption |
shift |
Number of elements shifted |
Decrypted text
decrypt("zw336 E693v")
decrypt("zw336 E693v")
Describe a dataset or variable (depending on input parameters)
describe(data, var, n, target, out = "text", ...)
describe(data, var, n, target, out = "text", ...)
data |
A dataset |
var |
A variable of the dataset |
n |
Weights variable for count-data |
target |
Target variable (0/1 or FALSE/TRUE) |
out |
Output format ("text"|"list") of variable description |
... |
Further arguments |
Description as table, text or list
# Load package library(magrittr) # Describe a dataset iris %>% describe() # Describe a variable iris %>% describe(Species) iris %>% describe(Sepal.Length)
# Load package library(magrittr) # Describe a dataset iris %>% describe() # Describe a variable iris %>% describe(Species) iris %>% describe(Sepal.Length)
Describe all variables of a dataset
describe_all(data, out = "large")
describe_all(data, out = "large")
data |
A dataset |
out |
Output format ("small"|"large") |
Dataset (tibble)
describe_all(iris)
describe_all(iris)
Describe categorical variable
describe_cat(data, var, n, max_cat = 10, out = "text", margin = 0)
describe_cat(data, var, n, max_cat = 10, out = "text", margin = 0)
data |
A dataset |
var |
Variable or variable name |
n |
Weights variable for count-data |
max_cat |
Maximum number of categories displayed |
out |
Output format ("text"|"list"|"tibble"|"df") |
margin |
Left margin for text output (number of spaces) |
Description as text or list
describe_cat(iris, Species)
describe_cat(iris, Species)
Describe numerical variable
describe_num(data, var, n, out = "text", margin = 0)
describe_num(data, var, n, out = "text", margin = 0)
data |
A dataset |
var |
Variable or variable name |
n |
Weights variable for count-data |
out |
Output format ("text"|"list") |
margin |
Left margin for text output (number of spaces) |
Description as text or list
describe_num(iris, Sepal.Length)
describe_num(iris, Sepal.Length)
Describe table (e.g. number of rows and columns of dataset)
describe_tbl(data, n, target, out = "text")
describe_tbl(data, n, target, out = "text")
data |
A dataset |
n |
Weights variable for count-data |
target |
Target variable (binary) |
out |
Output format ("text"|"list") |
Description as text or list
describe_tbl(iris) iris[1,1] <- NA describe_tbl(iris)
describe_tbl(iris) iris[1,1] <- NA describe_tbl(iris)
Drop all observations where expression is true
drop_obs_if(data, expr)
drop_obs_if(data, expr)
data |
Data frame |
expr |
Expression |
Data frame
drop_obs_if(iris, Species == "setosa") drop_obs_if(iris, Sepal.Length < 5 | Sepal.Length >7)
drop_obs_if(iris, Species == "setosa") drop_obs_if(iris, Sepal.Length < 5 | Sepal.Length >7)
Drop all observations with NA-values
drop_obs_with_na(data)
drop_obs_with_na(data)
data |
Data frame |
Data frame
data <- data.frame(a = 1:10, b = rep("A",10)) data[1,1] <- NA drop_obs_with_na(data)
data <- data.frame(a = 1:10, b = rep("A",10)) data[1,1] <- NA drop_obs_with_na(data)
Drop variables by name
drop_var_by_names(data, var_names)
drop_var_by_names(data, var_names)
data |
Data frame |
var_names |
Vector of variable names (as string) |
Data frame
drop_var_by_names(iris, "Species") drop_var_by_names(iris, c("Sepal.Length", "Sepal.Width"))
drop_var_by_names(iris, "Species") drop_var_by_names(iris, c("Sepal.Length", "Sepal.Width"))
Drop all variables with low variance
drop_var_low_variance(data, max_prop = 0.99)
drop_var_low_variance(data, max_prop = 0.99)
data |
Data frame |
max_prop |
Maximum proportion of values without variance |
Data frame
data <- data.frame(a = 1:100, b = c(0, rep(1, 99))) drop_var_low_variance(data, max_prop = 0.9)
data <- data.frame(a = 1:100, b = c(0, rep(1, 99))) drop_var_low_variance(data, max_prop = 0.9)
Drop all variables with no variance
drop_var_no_variance(data)
drop_var_no_variance(data)
data |
Data frame |
Data frame
data <- data.frame(a = 1:10, b = rep(1,10)) drop_var_no_variance(data)
data <- data.frame(a = 1:10, b = rep(1,10)) drop_var_no_variance(data)
Drop all non-numeric variables
drop_var_not_numeric(data)
drop_var_not_numeric(data)
data |
Data frame |
Data frame
data <- data.frame(a = 1:10, b = rep("A",10)) drop_var_not_numeric(data)
data <- data.frame(a = 1:10, b = rep("A",10)) drop_var_not_numeric(data)
Drop all variables with NA-values
drop_var_with_na(data)
drop_var_with_na(data)
data |
Data frame |
Data frame
data <- data.frame(a = 1:10, b = rep(NA,10)) drop_var_with_na(data)
data <- data.frame(a = 1:10, b = rep(NA,10)) drop_var_with_na(data)
encrypt text
encrypt(text, codeletters = c(toupper(letters), letters, 0:9), shift = 18)
encrypt(text, codeletters = c(toupper(letters), letters, 0:9), shift = 18)
text |
A text (character) |
codeletters |
A string of letters that are used for encryption |
shift |
Number of elements shifted |
Encrypted text
encrypt("hello world")
encrypt("hello world")
Explain a target using Random Forest.
explain_forest(data, target, ntree = 50, out = "plot", ...)
explain_forest(data, target, ntree = 50, out = "plot", ...)
data |
A dataset |
target |
Target variable (binary) |
ntree |
Number of trees used for Random Forest |
out |
Output of the function: "plot" | "model" | "importance" | "all" |
... |
Further arguments |
Plot of importance (if out = "plot")
data <- create_data_buy() explain_forest(data, target = buy)
data <- create_data_buy() explain_forest(data, target = buy)
Explain a binary target using a logistic regression (glm).
Model chosen by AIC in a Stepwise Algorithm (MASS::stepAIC()).
explain_logreg(data, target, out = "tibble", ...)
explain_logreg(data, target, out = "tibble", ...)
data |
A dataset |
target |
Target variable (binary) |
out |
Output of the function: "tibble" | "model" |
... |
Further arguments |
Dataset with results (term, estimate, std.error, z.value, p.value)
data <- iris data$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) data$Species <- NULL explain_logreg(data, target = is_versicolor)
data <- iris data$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) data$Species <- NULL explain_logreg(data, target = is_versicolor)
Explain a target using a simple decision tree (classification or regression)
explain_tree( data, target, n, max_cat = 10, max_target_cat = 5, maxdepth = 3, minsplit = 20, cp = 0, weights = NA, size = 0.7, out = "plot", ... )
explain_tree( data, target, n, max_cat = 10, max_target_cat = 5, maxdepth = 3, minsplit = 20, cp = 0, weights = NA, size = 0.7, out = "plot", ... )
data |
A dataset |
target |
Target variable |
n |
weights variable (for count data) |
max_cat |
Drop categorical variables with higher number of levels |
max_target_cat |
Maximum number of categories to be plotted for target (except NA) |
maxdepth |
Set the maximum depth of any node of the final tree, with the root
node counted as depth 0. Values greater than 30 rpart will give nonsense results on 32-bit machines. |
minsplit |
the minimum number of observations that must exist in a node in order for a split to be attempted. |
cp |
complexity parameter. Any split that does not decrease the overall
lack of fit by a factor of cp is not attempted. |
weights |
optional case weights. |
size |
Text size of plot |
out |
Output of function: "plot" | "model" |
... |
Further arguments |
Plot, or additionally the model (if out = "model")
data <- iris data$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) data$Species <- NULL explain_tree(data, target = is_versicolor)
data <- iris data$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) data$Species <- NULL explain_tree(data, target = is_versicolor)
Based on the hyperparameters defined in the setup parameter, XGBoost hyperparameter-tuning is carried out using cross-validation. The best model is chosen and returned. By default, the function returns the feature-importance plot. To get all outputs, use parameter out = "all"
explain_xgboost( data, target, log = TRUE, nthread = 1, setup = list(cv_nfold = 2, max_nrounds = 1000, early_stopping_rounds = 50, grid_xgboost = list(eta = c(0.3, 0.1, 0.01), max_depth = c(3, 5), gamma = 0, colsample_bytree = 0.8, subsample = 0.8, min_child_weight = 1, scale_pos_weight = 1)), out = "plot" )
explain_xgboost( data, target, log = TRUE, nthread = 1, setup = list(cv_nfold = 2, max_nrounds = 1000, early_stopping_rounds = 50, grid_xgboost = list(eta = c(0.3, 0.1, 0.01), max_depth = c(3, 5), gamma = 0, colsample_bytree = 0.8, subsample = 0.8, min_child_weight = 1, scale_pos_weight = 1)), out = "plot" )
data |
Data frame, must contain variable defined in target, but should not contain any customer-IDs or date/period columns |
target |
Target variable (must be binary 0/1, FALSE/TRUE, no/yes) |
log |
Log? |
nthread |
Number of threads used for training |
setup |
Setup of model |
out |
Output of the function: "plot" | "model" | "importance" | "all" |
Plot of importance (if out = "plot")
data <- use_data_iris() data$is_versicolor <- ifelse(data$Species == "versicolor", 1, 0) data$Species <- NULL explain_xgboost(data, target = is_versicolor, log = FALSE)
data <- use_data_iris() data$is_versicolor <- ifelse(data$Species == "versicolor", 1, 0) data$Species <- NULL explain_xgboost(data, target = is_versicolor, log = FALSE)
Explore a dataset or variable
explore( data, var, var2, n, target, targetpct, split, min_val = NA, max_val = NA, auto_scale = TRUE, na = NA, ... )
explore( data, var, var2, n, target, targetpct, split, min_val = NA, max_val = NA, auto_scale = TRUE, na = NA, ... )
data |
A dataset |
var |
A variable |
var2 |
A variable for checking correlation |
n |
A Variable for number of observations (count data) |
target |
Target variable (0/1 or FALSE/TRUE) |
targetpct |
Plot variable as target% (FALSE/TRUE) |
split |
Alternative to targetpct (split = !targetpct) |
min_val |
All values < min_val are converted to min_val |
max_val |
All values > max_val are converted to max_val |
auto_scale |
Use 0.02 and 0.98 quantile for min_val and max_val (if min_val and max_val are not defined) |
na |
Value to replace NA |
... |
Further arguments (like flip = TRUE/FALSE) |
Plot object
## Launch Shiny app (in interactive R sessions) if (interactive()) { explore(iris) } ## Explore grafically # Load library library(magrittr) # Explore a variable iris %>% explore(Species) iris %>% explore(Sepal.Length) iris %>% explore(Sepal.Length, min_val = 4, max_val = 7) # Explore a variable with a target iris$is_virginica <- ifelse(iris$Species == "virginica", 1, 0) iris %>% explore(Species, target = is_virginica) iris %>% explore(Sepal.Length, target = is_virginica) # Explore correlation between two variables iris %>% explore(Species, Petal.Length) iris %>% explore(Sepal.Length, Petal.Length) # Explore correlation between two variables and split by target iris %>% explore(Sepal.Length, Petal.Length, target = is_virginica)
## Launch Shiny app (in interactive R sessions) if (interactive()) { explore(iris) } ## Explore grafically # Load library library(magrittr) # Explore a variable iris %>% explore(Species) iris %>% explore(Sepal.Length) iris %>% explore(Sepal.Length, min_val = 4, max_val = 7) # Explore a variable with a target iris$is_virginica <- ifelse(iris$Species == "virginica", 1, 0) iris %>% explore(Species, target = is_virginica) iris %>% explore(Sepal.Length, target = is_virginica) # Explore correlation between two variables iris %>% explore(Species, Petal.Length) iris %>% explore(Sepal.Length, Petal.Length) # Explore correlation between two variables and split by target iris %>% explore(Sepal.Length, Petal.Length, target = is_virginica)
Explore all variables of a dataset (create plots)
explore_all( data, n, target, ncol = 2, targetpct, color = c("#ADD8E6", "#7BB8DA"), split = TRUE )
explore_all( data, n, target, ncol = 2, targetpct, color = c("#ADD8E6", "#7BB8DA"), split = TRUE )
data |
A dataset |
n |
Weights variable (only for count data) |
target |
Target variable (0/1 or FALSE/TRUE) |
ncol |
Layout of plots (number of columns) |
targetpct |
Plot variable as target% (FALSE/TRUE) |
color |
Force a default color (if possible) |
split |
Split by target (TRUE|FALSE) |
Plot
explore_all(iris) iris$is_virginica <- ifelse(iris$Species == "virginica", 1, 0) explore_all(iris, target = is_virginica)
explore_all(iris) iris$is_virginica <- ifelse(iris$Species == "virginica", 1, 0) explore_all(iris, target = is_virginica)
Create a barplot to explore a categorical variable. If a target is selected, the barplot is created for all levels of the target.
explore_bar( data, var, target, flip = NA, title = "", numeric = NA, max_cat = 30, max_target_cat = 5, color = c("#ADD8E6", "#7BB8DA"), legend_position = "right", label, label_size = 2.7, ... )
explore_bar( data, var, target, flip = NA, title = "", numeric = NA, max_cat = 30, max_target_cat = 5, color = c("#ADD8E6", "#7BB8DA"), legend_position = "right", label, label_size = 2.7, ... )
data |
A dataset |
var |
variable |
target |
target (can have more than 2 levels) |
flip |
Should plot be flipped? (change of x and y) |
title |
Title of the plot (if empty var name) |
numeric |
Display variable as numeric (not category) |
max_cat |
Maximum number of categories to be plotted |
max_target_cat |
Maximum number of categories to be plotted for target (except NA) |
color |
Color for bar |
legend_position |
Position of the legend ("right"|"bottom"|"top"|"none") |
label |
Show labels? (if empty, automatic) |
label_size |
Size of labels |
... |
Further arguments |
Plot object (bar chart)
Label and Value are in the data. Create a bar plot where the heights of the bars represent the values for each label.
explore_col( data, var_label, var_value, title = NA, subtitle = "", numeric = FALSE, max_cat = 30, na = 0, flip = NA, color = "#ADD8E6" )
explore_col( data, var_label, var_value, title = NA, subtitle = "", numeric = FALSE, max_cat = 30, na = 0, flip = NA, color = "#ADD8E6" )
data |
A dataset (categories + frequency) |
var_label |
Variable containing the label |
var_value |
Variable containing the value |
title |
Title of the plot |
subtitle |
Subtitle of the plot |
numeric |
Display variable as numeric (not category) |
max_cat |
Maximum number of categories to be plotted |
na |
Value to use for NA |
flip |
Flip plot? (for categorical variables) |
color |
Color for bar |
Plot object
library(magrittr) data <- data.frame(label = LETTERS[1:5], value = c(1.5,2,1.2,3,2.6)) data %>% explore_col(label, value)
library(magrittr) data <- data.frame(label = LETTERS[1:5], value = c(1.5,2,1.2,3,2.6)) data %>% explore_col(label, value)
Explore the correlation between two variables
explore_cor( data, x, y, target, bins = 8, min_val = NA, max_val = NA, auto_scale = TRUE, title = NA, color = c("#ADD8E6", "#7BB8DA"), ... )
explore_cor( data, x, y, target, bins = 8, min_val = NA, max_val = NA, auto_scale = TRUE, title = NA, color = c("#ADD8E6", "#7BB8DA"), ... )
data |
A dataset |
x |
Variable on x axis |
y |
Variable on y axis |
target |
Target variable (categorical) |
bins |
Number of bins |
min_val |
All values < min_val are converted to min_val |
max_val |
All values > max_val are converted to max_val |
auto_scale |
Use 0.02 and 0.98 quantile for min_val and max_val (if min_val and max_val are not defined) |
title |
Title of the plot |
color |
Color of the plot |
... |
Further arguments |
Plot
explore_cor(iris, x = Sepal.Length, y = Sepal.Width)
explore_cor(iris, x = Sepal.Length, y = Sepal.Width)
Create a plot to explore count data (categories + frequency). A variable named 'n' is auto-detected as frequency.
explore_count( data, cat, n, target, pct = FALSE, split = TRUE, title = NA, numeric = FALSE, max_cat = 30, max_target_cat = 5, color = c("#ADD8E6", "#7BB8DA"), flip = NA )
explore_count( data, cat, n, target, pct = FALSE, split = TRUE, title = NA, numeric = FALSE, max_cat = 30, max_target_cat = 5, color = c("#ADD8E6", "#7BB8DA"), flip = NA )
data |
A dataset (categories + frequency) |
cat |
Categorical variable |
n |
Number of observations (frequency) |
target |
Target variable |
pct |
Show as percent? |
split |
Split by target (FALSE/TRUE) |
title |
Title of the plot |
numeric |
Display variable as numeric (not category) |
max_cat |
Maximum number of categories to be plotted |
max_target_cat |
Maximum number of categories to be plotted for target (except NA) |
color |
Color for bar |
flip |
Flip plot? (for categorical variables) |
Plot object
library(dplyr) iris %>% count(Species) %>% explore_count(Species)
library(dplyr) iris %>% count(Species) %>% explore_count(Species)
Create a density plot to explore numerical variable
explore_density( data, var, target, title = "", min_val = NA, max_val = NA, color = c("#ADD8E6", "#7BB8DA"), auto_scale = TRUE, max_target_cat = 5, ... )
explore_density( data, var, target, title = "", min_val = NA, max_val = NA, color = c("#ADD8E6", "#7BB8DA"), auto_scale = TRUE, max_target_cat = 5, ... )
data |
A dataset |
var |
Variable |
target |
Target variable (0/1 or FALSE/TRUE) |
title |
Title of the plot (if empty var name) |
min_val |
All values < min_val are converted to min_val |
max_val |
All values > max_val are converted to max_val |
color |
Color of plot |
auto_scale |
Use 0.02 and 0.98 quantile for min_val and max_val (if min_val and max_val are not defined) |
max_target_cat |
Maximum number of levels of target shown in the plot (except NA). |
... |
Further arguments |
Plot object (density plot)
explore_density(iris, "Sepal.Length") iris$is_virginica <- ifelse(iris$Species == "virginica", 1, 0) explore_density(iris, Sepal.Length, target = is_virginica)
explore_density(iris, "Sepal.Length") iris$is_virginica <- ifelse(iris$Species == "virginica", 1, 0) explore_density(iris, Sepal.Length, target = is_virginica)
Launches a shiny app to explore a dataset
explore_shiny(data, target, color = c("#ADD8E6", "#7BB8DA"))
explore_shiny(data, target, color = c("#ADD8E6", "#7BB8DA"))
data |
A dataset |
target |
Target variable (0/1 or FALSE/TRUE) |
color |
Color for plots (vector) |
# Only run examples in interactive R sessions if (interactive()) { explore_shiny(iris) }
# Only run examples in interactive R sessions if (interactive()) { explore_shiny(iris) }
Create a plot to explore the relation between a variable and a binary target as target percent. The target variable is chosen automatically if possible (name starts with 'target').
explore_targetpct( data, var, target = NULL, title = NA, min_val = NA, max_val = NA, auto_scale = TRUE, na = NA, flip = NA, ... )
explore_targetpct( data, var, target = NULL, title = NA, min_val = NA, max_val = NA, auto_scale = TRUE, na = NA, flip = NA, ... )
data |
A dataset |
var |
Numerical variable |
target |
Target variable (0/1 or FALSE/TRUE) |
title |
Title of the plot |
min_val |
All values < min_val are converted to min_val |
max_val |
All values > max_val are converted to max_val |
auto_scale |
Use 0.02 and 0.98 quantile for min_val and max_val (if min_val and max_val are not defined) |
na |
Value to replace NA |
flip |
Flip plot? (for categorical variables) |
... |
Further arguments |
Plot object
iris$target01 <- ifelse(iris$Species == "versicolor",1,0) explore_targetpct(iris)
iris$target01 <- ifelse(iris$Species == "versicolor",1,0) explore_targetpct(iris)
Explore a table. Plots variable types, variables with no variance and variables with NA
explore_tbl(data, n)
explore_tbl(data, n)
data |
A dataset |
n |
Weight variable for count data |
explore_tbl(iris)
explore_tbl(iris)
Formats a number depending on the value as number with space, scientific or big number as k (1 000), M (1 000 000) or B (1 000 000 000)
format_num_auto(number = 0, digits = 1)
format_num_auto(number = 0, digits = 1)
number |
A number (integer or real) |
digits |
Number of digits |
Formatted number as text
format_num_kMB(5500, digits = 2)
format_num_kMB(5500, digits = 2)
Formats a big number as k (1 000), M (1 000 000) or B (1 000 000 000)
format_num_kMB(number = 0, digits = 1)
format_num_kMB(number = 0, digits = 1)
number |
A number (integer or real) |
digits |
Number of digits |
Formatted number as text
format_num_kMB(5500, digits = 2)
format_num_kMB(5500, digits = 2)
Formats a big number using space as big.mark (1000 = 1 000)
format_num_space(number = 0, digits = 1)
format_num_space(number = 0, digits = 1)
number |
A number (integer or real) |
digits |
Number of digits |
Formatted number as text
format_num_space(5500, digits = 2)
format_num_space(5500, digits = 2)
Formats a target as a 0/1 variable. If target is numeric, 1 = above average.
format_target(target)
format_target(target)
target |
Variable as vector |
Formatted target
iris$is_virginica <- ifelse(iris$Species == "virginica", "yes", "no") iris$target <- format_target(iris$is_virginica) table(iris$target)
iris$is_virginica <- ifelse(iris$Species == "virginica", "yes", "no") iris$target <- format_target(iris$is_virginica) table(iris$target)
Format type description of variable to 3 letters (int|dbl|lgl|chr|dat)
format_type(type)
format_type(type)
type |
Type description ("integer", "double", "logical", "character", "date") |
Formatted type description (int|dbl|lgl|chr|dat)
format_type(typeof(iris$Species))
format_type(typeof(iris$Species))
Get predefined colors
get_color(name, fill = FALSE, fill_color = "#DDDDDD", fill_n = 10)
get_color(name, fill = FALSE, fill_color = "#DDDDDD", fill_n = 10)
name |
Name of color/color-vector |
fill |
Fill color vector? |
fill_color |
Color to use to fill color vector |
fill_n |
Number of color codes to return |
Vector of color-codes
get_color("mario") get_color("mario") show_color(get_color("mario")) show_color(get_color("mario", fill = TRUE, fill_n = 10)) col <- get_color("mario") explore(iris, Sepal.Length, target = Species, color = col) explore(iris, Sepal.Length, target = Species, color = c(col["peach"], col["bowser"], col["donkeykong"]))
get_color("mario") get_color("mario") show_color(get_color("mario")) show_color(get_color("mario", fill = TRUE, fill_n = 10)) col <- get_color("mario") explore(iris, Sepal.Length, target = Species, color = col) explore(iris, Sepal.Length, target = Species, color = c(col["peach"], col["bowser"], col["donkeykong"]))
Return value of typeof, except if the variable contains <hide>, in which case "other" is returned
get_type(var)
get_type(var)
var |
A vector (dataframe column) |
Value of typeof or "other"
get_type(iris$Species)
get_type(iris$Species)
Put variables into "buckets" to create a set of plots instead of one large plot
get_var_buckets(data, bucket_size = 100, var_name_target = NA, var_name_n = NA)
get_var_buckets(data, bucket_size = 100, var_name_target = NA, var_name_n = NA)
data |
A dataset |
bucket_size |
Maximum number of variables in one bucket |
var_name_target |
Name of the target variable (if defined) |
var_name_n |
Name of the weight (n) variable (if defined) |
Buckets as a list
get_var_buckets(iris) get_var_buckets(iris, bucket_size = 2) get_var_buckets(iris, bucket_size = 2, var_name_target = "Species")
get_var_buckets(iris) get_var_buckets(iris, bucket_size = 2) get_var_buckets(iris, bucket_size = 2, var_name_target = "Species")
Guess if variable is categorical or numerical based on name, type and values of variable
guess_cat_num(var, descr)
guess_cat_num(var, descr)
var |
A vector (dataframe column) |
descr |
A description of the variable (optional) |
"cat" (categorical), "num" (numerical) or "oth" (other)
guess_cat_num(iris$Species)
guess_cat_num(iris$Species)
Make an explore plot interactive
interact(obj, lower_title = TRUE, hide_geom_text = TRUE)
interact(obj, lower_title = TRUE, hide_geom_text = TRUE)
obj |
An object (e.g. ggplot2-object) |
lower_title |
Lowering the title in ggplot2-object (TRUE|FALSE) |
hide_geom_text |
Hiding geom_text in ggplot2-object (TRUE|FALSE) |
Plot object
library(dplyr) if (interactive()) { iris %>% explore(Sepal.Length, target = Species) %>% interact() }
library(dplyr) if (interactive()) { iris %>% explore(Sepal.Length, target = Species) %>% interact() }
Log conditional
log_info_if(log = TRUE, text = "log")
log_info_if(log = TRUE, text = "log")
log |
log (TRUE|FALSE) |
text |
text string to be logged |
prints log on screen (if log == TRUE).
Mix colors
mix_color(color1, color2 = NA, n = 5)
mix_color(color1, color2 = NA, n = 5)
color1 |
Color 1 |
color2 |
Color 2 |
n |
Number of different colors that should be generated |
Vector of color-codes
mix_color("blue", n = 10) mix_color("gold", "red", n = 4)
mix_color("blue", n = 10) mix_color("gold", "red", n = 4)
Plots a legend that can be used for explore_all with a binary target
plot_legend_targetpct(border = TRUE)
plot_legend_targetpct(border = TRUE)
border |
Draw a border? |
Base plot
plot_legend_targetpct(border = TRUE)
plot_legend_targetpct(border = TRUE)
Plots a text (base plot) and let you choose text-size and color
plot_text(text = "hello world", size = 1.2, color = "black", ggplot = FALSE)
plot_text(text = "hello world", size = 1.2, color = "black", ggplot = FALSE)
text |
Text as string |
size |
Text-size |
color |
Text-color |
ggplot |
return a ggplot-object? (or base plot) |
Plot
plot_text("hello", size = 2, color = "red")
plot_text("hello", size = 2, color = "red")
Creates a ggplot with the variable-name as title and a text
plot_var_info(data, var, info = "")
plot_var_info(data, var, info = "")
data |
A dataset |
var |
Variable |
info |
Text to plot |
Plot (ggplot)
Predict target using a trained model.
predict_target(data, model, name = "prediction")
predict_target(data, model, name = "prediction")
data |
A dataset (data.frame or tbl) |
model |
A model created with an explain_*() function (e.g. explain_tree() with out = "model") |
name |
Prefix of variable-name for prediction |
data containing predicted probabilities for target values
data_train <- create_data_buy(seed = 1) data_test <- create_data_buy(seed = 2) model <- explain_tree(data_train, target = buy, out = "model") data <- predict_target(data = data_test, model = model) describe(data)
data_train <- create_data_buy(seed = 1) data_test <- create_data_buy(seed = 2) model <- explain_tree(data_train, target = buy, out = "model") data <- predict_target(data = data_test, model = model) describe(data)
Replace NA values of a variable in a dataframe
replace_na_with(data, var_name, with)
replace_na_with(data, var_name, with)
data |
A dataframe |
var_name |
Name of variable where NAs are replaced |
with |
Value instead of NA |
Updated dataframe
data <- data.frame(nr = c(1,2,3,NA,NA)) replace_na_with(data, "nr", 0)
data <- data.frame(nr = c(1,2,3,NA,NA)) replace_na_with(data, "nr", 0)
Generate a report of all variables. If target is defined, the relation to the target is reported.
report(data, n, target, targetpct, split, color, output_file, output_dir)
report(data, n, target, targetpct, split, color, output_file, output_dir)
data |
A dataset |
n |
Weights variable for count data |
target |
Target variable (0/1 or FALSE/TRUE) |
targetpct |
Plot variable as target% (FALSE/TRUE) |
split |
Alternative to targetpct (split = !targetpct) |
color |
User defined colors for plots (vector) |
output_file |
Filename of the html report |
output_dir |
Directory where to save the html report |
if (rmarkdown::pandoc_available("1.12.3")) { report(iris, output_dir = tempdir()) }
if (rmarkdown::pandoc_available("1.12.3")) { report(iris, output_dir = tempdir()) }
Rescales a numeric variable into values between 0 and 1
rescale01(x)
rescale01(x)
x |
numeric vector (to be rescaled) |
vector with values between 0 and 1
rescale01(0:10)
rescale01(0:10)
Show color vector as ggplot
show_color(color)
show_color(color)
color |
Vector of colors |
ggplot
show_color("gold") show_color(c("blue", "red", "green"))
show_color("gold") show_color(c("blue", "red", "green"))
A text string is converted into a simplified version by trimming, converting to upper case, replacing German umlauts, dropping special characters like comma and semicolon and replacing multiple spaces with one space.
simplify_text(text)
simplify_text(text)
text |
text string |
text string
simplify_text(" Hello World !, ")
simplify_text(" Hello World !, ")
Create a plot to explore relation between categorical variable and a binary target
target_explore_cat( data, var, target = "target_ind", min_val = NA, max_val = NA, flip = TRUE, num2char = TRUE, title = NA, auto_scale = TRUE, na = NA, max_cat = 25, color = c("#ECEFF1", "#CFD8DC", "#B0BEC5", "#90A4AE"), legend_position = "bottom" )
target_explore_cat( data, var, target = "target_ind", min_val = NA, max_val = NA, flip = TRUE, num2char = TRUE, title = NA, auto_scale = TRUE, na = NA, max_cat = 25, color = c("#ECEFF1", "#CFD8DC", "#B0BEC5", "#90A4AE"), legend_position = "bottom" )
data |
A dataset |
var |
Categorical variable |
target |
Target variable (0/1 or FALSE/TRUE) |
min_val |
All values < min_val are converted to min_val |
max_val |
All values > max_val are converted to max_val |
flip |
Should plot be flipped? (change of x and y) |
num2char |
If TRUE, numeric values in variable are converted into character |
title |
Title of plot |
auto_scale |
Not used, just for compatibility |
na |
Value to replace NA |
max_cat |
Maximum numbers of categories to be plotted |
color |
Color vector (4 colors) |
legend_position |
Position of legend ("right"|"bottom"|"none") |
Plot object
Create a plot to explore relation between numerical variable and a binary target
target_explore_num( data, var, target = "target_ind", min_val = NA, max_val = NA, bins = 10, flip = TRUE, title = NA, auto_scale = TRUE, na = NA, color = c("#ECEFF1", "#CFD8DC", "#B0BEC5", "#90A4AE"), legend_position = "bottom" )
target_explore_num( data, var, target = "target_ind", min_val = NA, max_val = NA, bins = 10, flip = TRUE, title = NA, auto_scale = TRUE, na = NA, color = c("#ECEFF1", "#CFD8DC", "#B0BEC5", "#90A4AE"), legend_position = "bottom" )
data |
A dataset |
var |
Numerical variable |
target |
Target variable (0/1 or FALSE/TRUE) |
min_val |
All values < min_val are converted to min_val |
max_val |
All values > max_val are converted to max_val |
bins |
Number of bins |
flip |
Should plot be flipped? (change of x and y) |
title |
Title of plot |
auto_scale |
Use 0.02 and 0.98 quantile for min_val and max_val (if min_val and max_val are not defined) |
na |
Value to replace NA |
color |
Color vector (4 colors) |
legend_position |
Position of legend ("right"|"bottom"|"none") |
Plot object
Get fig.height for an RMarkdown chunk using explore_all()
total_fig_height( data, var_name_n, var_name_target, nvar = NA, ncol = 2, size = 3 )
total_fig_height( data, var_name_n, var_name_target, nvar = NA, ncol = 2, size = 3 )
data |
A dataset |
var_name_n |
Weights variable for count data? (TRUE / MISSING) |
var_name_target |
Target variable (TRUE / MISSING) |
nvar |
Number of variables to plot |
ncol |
Number of columns (default = 2) |
size |
fig.height of 1 plot (default = 3) |
Number of rows
total_fig_height(iris) total_fig_height(iris, var_name_target = "Species") total_fig_height(nvar = 5)
total_fig_height(iris) total_fig_height(iris, var_name_target = "Species") total_fig_height(nvar = 5)
This data set is an incomplete collection of popular beers in Austria, Germany and Switzerland. Data are collected from various websites in 2023. Some of the collected data may be incorrect.
use_data_beer()
use_data_beer()
Dataset as tibble
use_data_beer()
use_data_beer()
This data set comes with the ggplot2 package. It contains the prices and other attributes of almost 54,000 diamonds.
use_data_diamonds()
use_data_diamonds()
Dataset
use_data_diamonds()
use_data_diamonds()
This data set comes with base R. The data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris setosa, versicolor, and virginica.
use_data_iris()
use_data_iris()
Dataset as tibble
use_data_iris()
use_data_iris()
This data set comes with the ggplot2 package. It contains a subset of the fuel economy data that the EPA makes available on https://fueleconomy.gov/. It contains only models which had a new release every year between 1999 and 2008 - this was used as a proxy for the popularity of the car.
use_data_mpg()
use_data_mpg()
Dataset
use_data_mpg()
use_data_mpg()
This data set comes with base R. The data was extracted from the 1974 Motor Trend US magazine, and comprises fuel consumption and 10 aspects of automobile design and performance for 32 automobiles (1973–74 models).
use_data_mtcars()
use_data_mtcars()
Dataset
use_data_mtcars()
use_data_mtcars()
This data set comes with the palmerpenguins package. It contains measurements for penguin species, island in Palmer Archipelago, size (flipper length, body mass, bill dimensions), and sex.
use_data_penguins()
use_data_penguins()
Dataset
use_data_penguins()
use_data_penguins()
This data set comes with the dplyr package. It contains data of 87 Star Wars characters.
use_data_starwars()
use_data_starwars()
Dataset
use_data_starwars()
use_data_starwars()
This data set comes with base R. Survival of passengers on the Titanic.
use_data_titanic(count = FALSE)
use_data_titanic(count = FALSE)
count |
use count data |
Dataset
use_data_titanic(count = TRUE) use_data_titanic(count = FALSE)
use_data_titanic(count = TRUE) use_data_titanic(count = FALSE)
This data set contains the result of a real Wordle challenge (in German) between two players. Wordle is a game where a player guesses a five-letter word in six tries. The variable "try" reflects the success of player A and B. Other variables like "noun", "aeiou", "unique", "common" and "rare" reflect the properties of the word.
use_data_wordle()
use_data_wordle()
Dataset
use_data_wordle()
use_data_wordle()
Create weights for the target variable in your dataset so that there are equal weights for target = 0 and target = 1. Target must be 0/1, FALSE/TRUE or no/yes.
weight_target(data, target)
weight_target(data, target)
data |
A dataset |
target |
Target variable (0/1, TRUE/FALSE, yes/no) |
Weights for each observation (as a vector)
iris$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) weights <- weight_target(iris, target = is_versicolor) versicolor <- iris$is_versicolor table(versicolor, weights)
iris$is_versicolor <- ifelse(iris$Species == "versicolor", 1, 0) weights <- weight_target(iris, target = is_versicolor) versicolor <- iris$is_versicolor table(versicolor, weights)
Calculate with periods (format yyyymm)
yyyymm_calc(yyyymm, add_month = 0, add_year = 0)
yyyymm_calc(yyyymm, add_month = 0, add_year = 0)
yyyymm |
Input vector of periods (format yyyymm) |
add_month |
How many months to add (can be negative too) |
add_year |
How many years to add (can be negative too) |
Vector of periods (format yyyymm)
yyyymm_calc(202412, add_month = 1) yyyymm_calc(c(202411,202412,202501), add_month = -1, add_year = 1)
yyyymm_calc(202412, add_month = 1) yyyymm_calc(c(202411,202412,202501), add_month = -1, add_year = 1)