Function to calculate groupwise summary statistics, much like the summary procedure of SAS

summary_by(
  data,
  formula,
  id = NULL,
  FUN = mean,
  keep.names = FALSE,
  p2d = FALSE,
  order = TRUE,
  full.dimension = FALSE,
  var.names = NULL,
  fun.names = NULL,
  ...
)

summaryBy(
  formula,
  data = parent.frame(),
  id = NULL,
  FUN = mean,
  keep.names = FALSE,
  p2d = FALSE,
  order = TRUE,
  full.dimension = FALSE,
  var.names = NULL,
  fun.names = NULL,
  ...
)

Arguments

data

A data frame.

formula

A formula object, see examples below.

id

A formula specifying variables that the data are not grouped by but that should nevertheless appear in the output. See examples below.

FUN

A function or a list of functions to be applied; see examples below.

keep.names

If TRUE and if there is only ONE function in FUN, then the variables in the output will have the same name as the variables in the input, see 'examples'.

p2d

Should parentheses in output variable names be replaced by dots?

order

Should the resulting dataframe be ordered according to the variables on the right hand side of the formula (using orderBy)?

full.dimension

If TRUE then rows of summary statistics are repeated such that the result will have the same number of rows as the input dataset.

var.names

Option for user to specify the names of the variables on the left hand side.

fun.names

Option for user to specify function names to apply to the variables on the left hand side.

...

Additional arguments to FUN. This could for example be NA actions.

Value

A dataframe.

Details

Extra arguments (...) are passed on to the functions in FUN. Hence care must be taken that all functions in FUN accept these arguments, or one can explicitly write a function which gets around this. This can particularly be an issue in connection with handling NAs. See examples below. Some code for this function has been suggested by Jim Robison-Cox. Thanks.

Author

Søren Højsgaard, sorenh@math.aau.dk

Examples


data(dietox)
dietox12    <- subset(dietox,Time==12)

# Summarise a vector as a named vector holding its mean (m), variance (v)
# and number of elements (n).
fun <- function(x) {
  avg <- mean(x)
  spread <- var(x)
  count <- length(x)
  c(m = avg, v = spread, n = count)
}

summaryBy(cbind(Weight, Feed) ~ Evit + Cu, data=dietox12,
          FUN=fun)
#>      Evit    Cu  Weight.m  Weight.v Weight.n   Feed.m     Feed.v Feed.n
#> 1 Evit000 Cu000 100.41426  46.41481        7 167.2571   86.40620      7
#> 2 Evit000 Cu035  98.51247  65.80137        8 171.7875  283.20408      8
#> 3 Evit000 Cu175  99.18748 235.70414        8 173.4625 1230.17117      8
#> 4 Evit100 Cu000 102.15712  52.31616        7 162.2000  174.98997      7
#> 5 Evit100 Cu035 102.47498  92.57374        8 178.2250  843.54503      8
#> 6 Evit100 Cu175 105.04283  55.49632        7 182.0143  810.98475      7
#> 7 Evit200 Cu000  97.24283  58.28952        7 160.3571   95.82953      7
#> 8 Evit200 Cu035  95.55553  33.90778        9 164.7667  267.88254      9
#> 9 Evit200 Cu175  98.63748 114.85415        8 168.4250  624.31074      8

summaryBy(list(c("Weight", "Feed"), c("Evit", "Cu")), data=dietox12,
          FUN=fun)
#>      Evit    Cu  Weight.m  Weight.v Weight.n   Feed.m     Feed.v Feed.n
#> 1 Evit000 Cu000 100.41426  46.41481        7 167.2571   86.40620      7
#> 2 Evit000 Cu035  98.51247  65.80137        8 171.7875  283.20408      8
#> 3 Evit000 Cu175  99.18748 235.70414        8 173.4625 1230.17117      8
#> 4 Evit100 Cu000 102.15712  52.31616        7 162.2000  174.98997      7
#> 5 Evit100 Cu035 102.47498  92.57374        8 178.2250  843.54503      8
#> 6 Evit100 Cu175 105.04283  55.49632        7 182.0143  810.98475      7
#> 7 Evit200 Cu000  97.24283  58.28952        7 160.3571   95.82953      7
#> 8 Evit200 Cu035  95.55553  33.90778        9 164.7667  267.88254      9
#> 9 Evit200 Cu175  98.63748 114.85415        8 168.4250  624.31074      8

## Computations on several variables are done using cbind( )
summaryBy(cbind(Weight, Feed) ~ Evit + Cu, data=subset(dietox, Time > 1),
   FUN=fun)
#>      Evit    Cu Weight.m Weight.v Weight.n   Feed.m   Feed.v Feed.n
#> 1 Evit000 Cu000 64.49219 545.3481       77 79.37792 2705.516     77
#> 2 Evit000 Cu035 62.53862 558.7867       88 81.19432 2852.290     88
#> 3 Evit000 Cu175 64.51362 603.8636       88 84.37955 3158.985     88
#> 4 Evit100 Cu000 64.70803 553.4286       87 77.46552 2434.824     87
#> 5 Evit100 Cu035 64.38976 595.2586       88 81.04773 3159.666     88
#> 6 Evit100 Cu175 67.81033 578.9620       87 86.72299 3129.720     87
#> 7 Evit200 Cu000 61.40113 537.8238       87 76.35517 2409.296     87
#> 8 Evit200 Cu035 61.68483 504.8144       99 79.13939 2619.280     99
#> 9 Evit200 Cu175 64.16135 562.3279       88 80.88636 2874.140     88

## Calculations on transformed data are possible using cbind( ), but
# the transformed variables must be named

summaryBy(cbind(lw=log(Weight), Feed) ~ Evit + Cu, data=dietox12, FUN=mean)
#>      Evit    Cu  lw.mean Feed.mean
#> 1 Evit000 Cu000 4.607344  167.2571
#> 2 Evit000 Cu035 4.587091  171.7875
#> 3 Evit000 Cu175 4.584574  173.4625
#> 4 Evit100 Cu000 4.624415  162.2000
#> 5 Evit100 Cu035 4.625384  178.2250
#> 6 Evit100 Cu175 4.652181  182.0143
#> 7 Evit200 Cu000 4.574510  160.3571
#> 8 Evit200 Cu035 4.558066  164.7667
#> 9 Evit200 Cu175 4.585830  168.4250
 
## There are missing values in the 'airquality' data, so we remove these
## before calculating mean and variance with 'na.rm=TRUE'. However the
## length function does not accept any such argument. Hence we get
## around this by defining our own summary function in which length is
## not supplied with this argument while mean and var are:

# Summary function for data with missing values: the mean (m) and the
# variance (v) are computed with na.rm=TRUE, whereas the length (l) is
# taken of the full vector because length() has no such argument.
sumfun <- function(x, ...) {
  avg <- mean(x, na.rm = TRUE, ...)
  spread <- var(x, na.rm = TRUE, ...)
  c(m = avg, v = spread, l = length(x))
}
summaryBy(cbind(Ozone, Solar.R) ~ Month, data=airquality, FUN=sumfun)
#>   Month  Ozone.m   Ozone.v Ozone.l Solar.R.m Solar.R.v Solar.R.l
#> 1     5 23.61538  493.9262      31  181.2963 13242.370        31
#> 2     6 29.44444  331.5278      30  190.1667  8627.247        30
#> 3     7 59.11538 1000.8262      31  216.4839  6491.258        31
#> 4     8 59.96154 1574.5985      31  171.8571  5903.608        31
#> 5     9 31.44828  582.8276      30  167.4333  6259.702        30
## Compare with
aggregate(cbind(Ozone, Solar.R) ~ Month, data=airquality, FUN=sumfun)
#>   Month    Ozone.m    Ozone.v    Ozone.l  Solar.R.m  Solar.R.v  Solar.R.l
#> 1     5   24.12500  523.76630   24.00000   182.0417 14072.1286    24.0000
#> 2     6   29.44444  331.52778    9.00000   184.2222  9159.6944     9.0000
#> 3     7   59.11538 1000.82615   26.00000   216.4231  6478.2538    26.0000
#> 4     8   60.00000 1744.54545   23.00000   173.0870  6215.0830    23.0000
#> 5     9   31.44828  582.82759   29.00000   168.2069  6464.6700    29.0000

## Using '.' on the right hand side of a formula means to stratify by
## all variables not used elsewhere:

data(warpbreaks)
summaryBy(breaks ~ wool + tension, warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778
summaryBy(breaks ~ ., warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778
summaryBy(. ~ wool + tension, warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778

summaryBy(. ~ wool + tension, warpbreaks, FUN=mean)
#>   wool tension breaks.mean
#> 1    A       L    44.55556
#> 2    A       M    24.00000
#> 3    A       H    24.55556
#> 4    B       L    28.22222
#> 5    B       M    28.77778
#> 6    B       H    18.77778