Skip to contents

Data cube creation (dcc)

Usage

dcc(.data, .variables, .fun = jointfun_, ...)

dcc2(.data, .variables, .fun = jointfun_, order_type = extract_unique2, ...)

dcc5(
  .data,
  .variables,
  .fun = jointfun_,
  .total = "Totale",
  order_type = extract_unique4,
  .all = TRUE,
  ...
)

Arguments

.data

data frame to be processed

.variables

variables to split data frame by, as a character vector (c("var1", "var2")).

.fun

function to apply to each piece (default: jointfun_)

...

additional arguments (for example, further functions) passed on to .fun.

order_type

a function like extract_unique or extract_unique2.

.total

character string with the name to give to the subset of data that includes all the observations of a variable (default: "Totale").

.all

logical, indicating whether the functions have to be evaluated on the complete dataset.

Value

a data cube, with a column for each categorical variable used, and a row for each combination of all the categorical variables' modalities. In addition to all the modalities, each variable will also have a "Total" possibility (labelled "Totale" by default), which includes all the others. The data cube will contain marginal, conditional and joint empirical distributions...

Examples

data("invented_wages")
str(invented_wages)
#> tibble [1,000 × 5] (S3: tbl_df/tbl/data.frame)
#>  $ gender        : Factor w/ 2 levels "men","women": 1 2 1 2 1 1 1 2 2 2 ...
#>  $ sector        : Factor w/ 2 levels "secondary","tertiary": 2 1 2 2 1 1 2 1 2 1 ...
#>  $ education     : Factor w/ 3 levels "I","II","III": 3 2 2 2 2 1 3 1 2 2 ...
#>  $ wage          : num [1:1000] 8400 4200 5100 7400 4300 4900 5400 2900 4500 3000 ...
#>  $ sample_weights: num [1:1000] 105 32 36 12 21 46 79 113 34 32 ...
tmp <- dcc(
  .data = invented_wages, 
  .variables = c("gender", "sector"), 
  .fun = jointfun_
)
#> 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================================================| 100%
tmp
#> # A tibble: 9 × 3
#>   gender sector        n
#> * <fct>  <fct>     <int>
#> 1 Totale Totale     1000
#> 2 Totale secondary   455
#> 3 Totale tertiary    545
#> 4 men    Totale      547
#> 5 men    secondary   289
#> 6 men    tertiary    258
#> 7 women  Totale      453
#> 8 women  secondary   166
#> 9 women  tertiary    287
str(tmp)
#> tibble [9 × 3] (S3: tbl_df/tbl/data.frame)
#>  $ gender: Factor w/ 3 levels "Totale","men",..: 1 1 1 2 2 2 3 3 3
#>  $ sector: Factor w/ 3 levels "Totale","secondary",..: 1 2 3 1 2 3 1 2 3
#>  $ n     : int [1:9] 1000 455 545 547 289 258 453 166 287
#>  - attr(*, ".variables")= chr [1:2] "gender" "sector"
tmp2 <- dcc2(
  .data = invented_wages, 
  .variables = c("gender", "education"), 
  .fun = jointfun_, 
  order_type = extract_unique2
)
#> 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================================================| 100%
tmp2
#> # A tibble: 12 × 3
#>    gender education     n
#>  * <fct>  <fct>     <int>
#>  1 Totale Totale     1000
#>  2 Totale I           172
#>  3 Totale II          719
#>  4 Totale III         109
#>  5 men    Totale      547
#>  6 men    I            60
#>  7 men    II          409
#>  8 men    III          78
#>  9 women  Totale      453
#> 10 women  I           112
#> 11 women  II          310
#> 12 women  III          31
str(tmp2)
#> tibble [12 × 3] (S3: tbl_df/tbl/data.frame)
#>  $ gender   : Factor w/ 3 levels "Totale","men",..: 1 1 1 1 2 2 2 2 3 3 ...
#>  $ education: Factor w/ 4 levels "Totale","I","II",..: 1 2 3 4 1 2 3 4 1 2 ...
#>  $ n        : int [1:12] 1000 172 719 109 547 60 409 78 453 112 ...
#>  - attr(*, ".variables")= chr [1:2] "gender" "education"

# dcc5 works like dcc2, but has an additional optional argument, .total,
# that can be added to give a name to the groups that include all the 
# observations of a variable.
tmp5 <- dcc5(
  .data = invented_wages, 
  .variables = c("gender", "education"),
  .fun = jointfun_,
  .total = "TOTAL",
  order_type = extract_unique2
)
#> 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |======================================================================| 100%
tmp5
#> # A tibble: 12 × 3
#>    gender education     n
#>  * <fct>  <fct>     <int>
#>  1 TOTAL  TOTAL      1000
#>  2 TOTAL  I           172
#>  3 TOTAL  II          719
#>  4 TOTAL  III         109
#>  5 men    TOTAL       547
#>  6 men    I            60
#>  7 men    II          409
#>  8 men    III          78
#>  9 women  TOTAL       453
#> 10 women  I           112
#> 11 women  II          310
#> 12 women  III          31