Function to calculate groupwise summary statistics

Function to calculate groupwise summary statistics, much like the summary procedure of SAS

summary_by(
  data,
  formula,
  id = NULL,
  FUN = mean,
  keep.names = FALSE,
  p2d = FALSE,
  order = TRUE,
  full.dimension = FALSE,
  var.names = NULL,
  fun.names = NULL,
  ...
)

summaryBy(
  formula,
  data = parent.frame(),
  id = NULL,
  FUN = mean,
  keep.names = FALSE,
  p2d = FALSE,
  order = TRUE,
  full.dimension = FALSE,
  var.names = NULL,
  fun.names = NULL,
  ...
)

Arguments

data: A data frame.
formula: A formula object, see examples below.
id: A formula specifying variables by which the data are not grouped but which should still appear in the output. See examples below.
FUN: A list of functions to be applied, see examples below.
keep.names: If TRUE and if there is only ONE function in FUN, then the variables in the output will have the same name as the variables in the input, see 'examples'.
p2d: Should parentheses in output variable names be replaced by dots?
order: Should the resulting dataframe be ordered according to the variables on the right hand side of the formula? (using orderBy)
full.dimension: If TRUE then rows of summary statistics are repeated such that the result will have the same number of rows as the input dataset.
var.names: Option for user to specify the names of the variables on the left hand side.
fun.names: Option for user to specify function names to apply to the variables on the left hand side.
...: Additional arguments to FUN. This could for example be NA actions.

Value

A dataframe.

Details

Extra arguments (...) are passed on to the functions in FUN. Hence care must be taken that all functions in FUN accept these arguments - OR one can explicitly write a function which gets around this. This can particularly be an issue in connection with handling NAs. See examples below. Some code for this function has been suggested by Jim Robison-Cox. Thanks.

Author

Søren Højsgaard, sorenh@math.aau.dk

Examples


data(dietox)
dietox12    <- subset(dietox,Time==12)

# Per-group summary used in the examples: returns a named vector with the
# mean (m), the variance (v) and the number of observations (n) of x.
fun <- function(x) {
  stats <- c(mean(x), var(x), length(x))
  names(stats) <- c("m", "v", "n")
  stats
}

summaryBy(cbind(Weight, Feed) ~ Evit + Cu, data=dietox12,
          FUN=fun)
#>      Evit    Cu  Weight.m  Weight.v Weight.n   Feed.m     Feed.v Feed.n
#> 1 Evit000 Cu000 100.41426  46.41481        7 167.2571   86.40620      7
#> 2 Evit000 Cu035  98.51247  65.80137        8 171.7875  283.20408      8
#> 3 Evit000 Cu175  99.18748 235.70414        8 173.4625 1230.17117      8
#> 4 Evit100 Cu000 102.15712  52.31616        7 162.2000  174.98997      7
#> 5 Evit100 Cu035 102.47498  92.57374        8 178.2250  843.54503      8
#> 6 Evit100 Cu175 105.04283  55.49632        7 182.0143  810.98475      7
#> 7 Evit200 Cu000  97.24283  58.28952        7 160.3571   95.82953      7
#> 8 Evit200 Cu035  95.55553  33.90778        9 164.7667  267.88254      9
#> 9 Evit200 Cu175  98.63748 114.85415        8 168.4250  624.31074      8

summaryBy(list(c("Weight", "Feed"), c("Evit", "Cu")), data=dietox12,
          FUN=fun)
#>      Evit    Cu  Weight.m  Weight.v Weight.n   Feed.m     Feed.v Feed.n
#> 1 Evit000 Cu000 100.41426  46.41481        7 167.2571   86.40620      7
#> 2 Evit000 Cu035  98.51247  65.80137        8 171.7875  283.20408      8
#> 3 Evit000 Cu175  99.18748 235.70414        8 173.4625 1230.17117      8
#> 4 Evit100 Cu000 102.15712  52.31616        7 162.2000  174.98997      7
#> 5 Evit100 Cu035 102.47498  92.57374        8 178.2250  843.54503      8
#> 6 Evit100 Cu175 105.04283  55.49632        7 182.0143  810.98475      7
#> 7 Evit200 Cu000  97.24283  58.28952        7 160.3571   95.82953      7
#> 8 Evit200 Cu035  95.55553  33.90778        9 164.7667  267.88254      9
#> 9 Evit200 Cu175  98.63748 114.85415        8 168.4250  624.31074      8

## Computations on several variables are done using cbind( )
summaryBy(cbind(Weight, Feed) ~ Evit + Cu, data=subset(dietox, Time > 1),
   FUN=fun)
#>      Evit    Cu Weight.m Weight.v Weight.n   Feed.m   Feed.v Feed.n
#> 1 Evit000 Cu000 64.49219 545.3481       77 79.37792 2705.516     77
#> 2 Evit000 Cu035 62.53862 558.7867       88 81.19432 2852.290     88
#> 3 Evit000 Cu175 64.51362 603.8636       88 84.37955 3158.985     88
#> 4 Evit100 Cu000 64.70803 553.4286       87 77.46552 2434.824     87
#> 5 Evit100 Cu035 64.38976 595.2586       88 81.04773 3159.666     88
#> 6 Evit100 Cu175 67.81033 578.9620       87 86.72299 3129.720     87
#> 7 Evit200 Cu000 61.40113 537.8238       87 76.35517 2409.296     87
#> 8 Evit200 Cu035 61.68483 504.8144       99 79.13939 2619.280     99
#> 9 Evit200 Cu175 64.16135 562.3279       88 80.88636 2874.140     88

## Calculations on transformed data are possible using cbind( ), but
# the transformed variables must be named

summaryBy(cbind(lw=log(Weight), Feed) ~ Evit + Cu, data=dietox12, FUN=mean)
#>      Evit    Cu  lw.mean Feed.mean
#> 1 Evit000 Cu000 4.607344  167.2571
#> 2 Evit000 Cu035 4.587091  171.7875
#> 3 Evit000 Cu175 4.584574  173.4625
#> 4 Evit100 Cu000 4.624415  162.2000
#> 5 Evit100 Cu035 4.625384  178.2250
#> 6 Evit100 Cu175 4.652181  182.0143
#> 7 Evit200 Cu000 4.574510  160.3571
#> 8 Evit200 Cu035 4.558066  164.7667
#> 9 Evit200 Cu175 4.585830  168.4250
 
## There are missing values in the 'airquality' data, so we remove these
## before calculating mean and variance with 'na.rm=TRUE'. However the
## length function does not accept any such argument. Hence we get
## around this by defining our own summary function in which length is
## not supplied with this argument while mean and var are:

# NA-tolerant summary: the mean (m) and variance (v) are computed with
# missing values removed, while the length (l) counts every element of x.
# Extra arguments in ... are forwarded to mean() and var() only, because
# length() does not accept them.
sumfun <- function(x, ...) {
  avg <- mean(x, na.rm = TRUE, ...)
  spread <- var(x, na.rm = TRUE, ...)
  c(m = avg, v = spread, l = length(x))
}
summaryBy(cbind(Ozone, Solar.R) ~ Month, data=airquality, FUN=sumfun)
#>   Month  Ozone.m   Ozone.v Ozone.l Solar.R.m Solar.R.v Solar.R.l
#> 1     5 23.61538  493.9262      31  181.2963 13242.370        31
#> 2     6 29.44444  331.5278      30  190.1667  8627.247        30
#> 3     7 59.11538 1000.8262      31  216.4839  6491.258        31
#> 4     8 59.96154 1574.5985      31  171.8571  5903.608        31
#> 5     9 31.44828  582.8276      30  167.4333  6259.702        30
## Compare with
aggregate(cbind(Ozone, Solar.R) ~ Month, data=airquality, FUN=sumfun)
#>   Month    Ozone.m    Ozone.v    Ozone.l  Solar.R.m  Solar.R.v  Solar.R.l
#> 1     5   24.12500  523.76630   24.00000   182.0417 14072.1286    24.0000
#> 2     6   29.44444  331.52778    9.00000   184.2222  9159.6944     9.0000
#> 3     7   59.11538 1000.82615   26.00000   216.4231  6478.2538    26.0000
#> 4     8   60.00000 1744.54545   23.00000   173.0870  6215.0830    23.0000
#> 5     9   31.44828  582.82759   29.00000   168.2069  6464.6700    29.0000

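## Using '.' on the right hand side of a formula means to stratify by
## all variables not used elsewhere; a '.' on the left hand side (as in
## the last examples below) stands for the variables to be summarised
## that are not used elsewhere: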

data(warpbreaks)
summaryBy(breaks ~ wool + tension, warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778
summaryBy(breaks ~ ., warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778
summaryBy(. ~ wool + tension, warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778

summaryBy(. ~ wool + tension, warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778