Data cube creation (dcc)

dcc(.data, .variables, .fun = jointfun_, ...)

dcc2(.data, .variables, .fun = jointfun_, order_type = extract_unique2, ...)

dcc5(
  .data,
  .variables,
  .fun = jointfun_,
  .total = "Totale",
  order_type = extract_unique4,
  .all = TRUE,
  ...
)

Arguments

.data

data frame to be processed

.variables

variables to split data frame by, as a character vector (c("var1", "var2")).

.fun

function to apply to each piece (default: jointfun_)

...

additional functions passed to .fun.

order_type

a function like extract_unique or extract_unique2.

.total

character string with the name to give to the subset of data that includes all the observations of a variable (default: "Totale").

.all

logical, indicating whether the functions have to be evaluated on the complete dataset.

Value

a data cube, with a column for each categorical variable used, and a row for each combination of all the categorical variables' modalities. In addition to all the modalities, each variable will also have a "Total" possibility, which includes all the others. The data cube will contain marginal, conditional and joint empirical distributions...

Examples

data("invented_wages")
str(invented_wages)
#> tibble [1,000 × 5] (S3: tbl_df/tbl/data.frame) #> $ gender : Factor w/ 2 levels "men","women": 1 2 1 2 1 1 1 2 2 2 ... #> $ sector : Factor w/ 2 levels "secondary","tertiary": 2 1 2 2 1 1 2 1 2 1 ... #> $ education : Factor w/ 3 levels "I","II","III": 3 2 2 2 2 1 3 1 2 2 ... #> $ wage : num [1:1000] 8400 4200 5100 7400 4300 4900 5400 2900 4500 3000 ... #> $ sample_weights: num [1:1000] 105 32 36 12 21 46 79 113 34 32 ...
tmp <- dcc(.data = invented_wages, .variables = c("gender", "sector"), .fun = jointfun_)
tmp
#> # A tibble: 9 x 3 #> gender sector n #> * <fct> <fct> <int> #> 1 Totale Totale 1000 #> 2 Totale secondary 455 #> 3 Totale tertiary 545 #> 4 men Totale 547 #> 5 men secondary 289 #> 6 men tertiary 258 #> 7 women Totale 453 #> 8 women secondary 166 #> 9 women tertiary 287
str(tmp)
#> tibble [9 × 3] (S3: tbl_df/tbl/data.frame) #> $ gender: Factor w/ 3 levels "Totale","men",..: 1 1 1 2 2 2 3 3 3 #> $ sector: Factor w/ 3 levels "Totale","secondary",..: 1 2 3 1 2 3 1 2 3 #> $ n : int [1:9] 1000 455 545 547 289 258 453 166 287 #> - attr(*, ".variables")= chr [1:2] "gender" "sector"
tmp2 <- dcc2(.data = invented_wages, .variables = c("gender", "education"),
             .fun = jointfun_, order_type = extract_unique2)
tmp2
#> # A tibble: 12 x 3 #> gender education n #> * <fct> <fct> <int> #> 1 Totale Totale 1000 #> 2 Totale I 172 #> 3 Totale II 719 #> 4 Totale III 109 #> 5 men Totale 547 #> 6 men I 60 #> 7 men II 409 #> 8 men III 78 #> 9 women Totale 453 #> 10 women I 112 #> 11 women II 310 #> 12 women III 31
str(tmp2)
#> tibble [12 × 3] (S3: tbl_df/tbl/data.frame) #> $ gender : Factor w/ 3 levels "Totale","men",..: 1 1 1 1 2 2 2 2 3 3 ... #> $ education: Factor w/ 4 levels "Totale","I","II",..: 1 2 3 4 1 2 3 4 1 2 ... #> $ n : int [1:12] 1000 172 719 109 547 60 409 78 453 112 ... #> - attr(*, ".variables")= chr [1:2] "gender" "education"
# dcc5 works like dcc2, but has an additional optional argument, .total,
# that can be added to give a name to the groups that include all the
# observations of a variable.
tmp5 <- dcc5(.data = invented_wages, .variables = c("gender", "education"),
             .fun = jointfun_, .total = "TOTAL", order_type = extract_unique2)
tmp5
#> # A tibble: 12 x 3 #> gender education n #> * <fct> <fct> <int> #> 1 TOTAL TOTAL 1000 #> 2 TOTAL I 172 #> 3 TOTAL II 719 #> 4 TOTAL III 109 #> 5 men TOTAL 547 #> 6 men I 60 #> 7 men II 409 #> 8 men III 78 #> 9 women TOTAL 453 #> 10 women I 112 #> 11 women II 310 #> 12 women III 31