data() # list the data sets available in all attached packages (not only `datasets`)
data(package="vcd") # list only the data sets in the vcd package
vcdExtra::datasets() gives more detailed information on the datasets in a package.
knitr::kable() turns this into a nicely formatted table.
# List the datasets in vcd (item, class, dimensions, title) and render the listing as a table
vcdExtra::datasets("vcd") |> knitr::kable()
Item | class | dim | Title |
---|---|---|---|
Arthritis | data.frame | 84x5 | Arthritis Treatment Data |
Baseball | data.frame | 322x25 | Baseball Data |
BrokenMarriage | data.frame | 20x4 | Broken Marriage Data |
Bundesliga | data.frame | 14018x7 | Ergebnisse der Fussball-Bundesliga |
Bundestag2005 | table | 16x5 | Votes in German Bundestag Election 2005 |
Butterfly | table | 24 | Butterfly Species in Malaya |
CoalMiners | table | 2x2x9 | Breathlessness and Wheeze in Coal Miners |
DanishWelfare | data.frame | 180x5 | Danish Welfare Study Data |
Employment | table | 2x6x2 | Employment Status |
Federalist | table | 7 | ‘May’ in Federalist Papers |
Hitters | data.frame | 154x4 | Hitters Data |
HorseKicks | table | 5 | Death by Horse Kicks |
Hospital | table | 3x3 | Hospital data |
JobSatisfaction | data.frame | 8x4 | Job Satisfaction Data |
JointSports | data.frame | 40x5 | Opinions About Joint Sports |
Lifeboats | data.frame | 18x8 | Lifeboats on the Titanic |
MSPatients | array | 4x4x2 | Diagnosis of Multiple Sclerosis |
NonResponse | data.frame | 12x4 | Non-Response Survey Data |
OvaryCancer | data.frame | 16x5 | Ovary Cancer Data |
PreSex | table | 2x2x2x2 | Pre-marital Sex and Divorce |
Punishment | data.frame | 36x5 | Corporal Punishment Data |
RepVict | table | 8x8 | Repeat Victimization Data |
Rochdale | table | 2x2x2x2x2x2x2x2 | Rochdale Data |
Saxony | table | 13 | Families in Saxony |
SexualFun | table | 4x4 | Sex is Fun |
SpaceShuttle | data.frame | 24x6 | Space Shuttle O-ring Failures |
Suicide | data.frame | 306x6 | Suicide Rates in Germany |
Trucks | data.frame | 24x5 | Truck Accidents Data |
UKSoccer | table | 5x5 | UK Soccer Scores |
VisualAcuity | data.frame | 32x4 | Visual Acuity in Left and Right Eyes |
VonBort | data.frame | 280x4 | Von Bortkiewicz Horse Kicks Data |
WeldonDice | table | 11 | Weldon’s Dice Data |
WomenQueue | table | 11 | Women in Queues |
Typically, load data from a package using data()
# Load the 2 x 2 x 6 admissions table and inspect its structure
data("UCBAdmissions")
str(UCBAdmissions)
## 'table' num [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
## - attr(*, "dimnames")=List of 3
## ..$ Admit : chr [1:2] "Admitted" "Rejected"
## ..$ Gender: chr [1:2] "Male" "Female"
## ..$ Dept : chr [1:6] "A" "B" "C" "D" ...
# Grand total over all cells
sum(UCBAdmissions)
## [1] 4526
# One-way margin: sum over everything except dimension 1 (Admit)
marginSums(UCBAdmissions, 1)
## Admit
## Admitted Rejected
## 1755 2771
# Two-way margin: keep dimensions 2 and 3 (Gender by Dept)
marginSums(UCBAdmissions, 2:3)
## Dept
## Gender A B C D E F
## Male 825 560 325 417 191 373
## Female 108 25 593 375 393 341
# Frequency-form data: one row per sex x party combination, with its cell count
GSS <- data.frame(
  sex   = rep_len(c("female", "male"), length.out = 6),
  party = rep(c("dem", "indep", "rep"), each = 2),
  count = c(279, 165, 73, 47, 225, 191)
)
GSS
## sex party count
## 1 female dem 279
## 2 male dem 165
## 3 female indep 73
## 4 male indep 47
## 5 female rep 225
## 6 male rep 191
# Load the Arthritis data frame from the vcd package and inspect it (84 rows, one per ID)
data(Arthritis, package = "vcd")
str(Arthritis)
## 'data.frame': 84 obs. of 5 variables:
## $ ID : int 57 46 77 17 36 23 75 39 33 55 ...
## $ Treatment: Factor w/ 2 levels "Placebo","Treated": 2 2 2 2 2 2 2 2 2 2 ...
## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Age : int 27 29 30 32 46 58 59 59 63 63 ...
## $ Improved : Ord.factor w/ 3 levels "None"<"Some"<..: 2 1 1 3 3 3 1 3 1 1 ...
head(Arthritis) # see the first few lines
## ID Treatment Sex Age Improved
## 1 57 Treated Male 27 Some
## 2 46 Treated Male 29 None
## 3 77 Treated Male 30 None
## 4 17 Treated Male 32 Marked
## 5 36 Treated Male 46 Marked
## 6 23 Treated Male 58 Marked
# One-way frequency table of the ordered outcome
table(Arthritis$Improved)
##
## None Some Marked
## 42 14 28
# Two-way table; with `$` arguments the dimensions print without variable names
table(Arthritis$Treatment, Arthritis$Sex)
##
## Female Male
## Placebo 32 11
## Treated 27 14
with() is a shorthand to avoid repeating the name of the dataset
# Inside with(), Treatment and Sex are looked up in Arthritis, so the table prints with its variable names
with(Arthritis, table(Treatment, Sex))
## Sex
## Treatment Female Male
## Placebo 32 11
## Treated 27 14
xtabs(), with its formula interface, is often easier
# Cross-classify the cases by Sex, Treatment and Improved using a one-sided formula
art.table <- xtabs(~ Sex + Treatment + Improved, data=Arthritis)
ftable(art.table) # display as flattened table
## Improved None Some Marked
## Sex Treatment
## Female Placebo 19 7 6
## Treated 6 5 16
## Male Placebo 10 0 1
## Treated 7 2 5
summary(art.table) # chi-square test for mutual independence
## Call: xtabs(formula = ~Sex + Treatment + Improved, data = Arthritis)
## Number of cases in table: 84
## Number of factors: 3
## Test for independence of all factors:
## Chisq = 19.6, df = 7, p-value = 0.006501
## Chi-squared approximation may be incorrect
# plot() on a table draws a mosaic display; shade=TRUE adds residual-based shading
plot(art.table, shade=TRUE)
Read a data table from a local file (NB: use '/' rather than '\' in file paths; this works on all systems)
# arthritis <- read.csv("N:/psy6136/data/arthritis.csv")
# arthritis <- read.csv(file.choose())
or, read the same data from a web URL …
# Read the CSV copy of the Arthritis data from GitHub (needs network access), then inspect it
arthritis <- read.csv("https://raw.githubusercontent.com/friendly/psy6136/master/data/Arthritis.csv")
str(arthritis)
## 'data.frame': 84 obs. of 5 variables:
## $ ID : int 57 46 77 17 36 23 75 39 33 55 ...
## $ Treatment: chr "Treated" "Treated" "Treated" "Treated" ...
## $ Sex : chr "Male" "Male" "Male" "Male" ...
## $ Age : int 27 29 30 32 46 58 59 59 63 63 ...
## $ Improved : chr "Some" "None" "None" "Marked" ...
read.csv() doesn’t convert categorical variables to factors (since R 4.0.0, stringsAsFactors = FALSE is the default).
They are just character variables.
# A character column has no levels, so levels() returns NULL
levels(arthritis$Improved)
## NULL
# Recode as an ordered factor, spelling out the order None < Some < Marked
arthritis$Improved <- factor(
  arthritis$Improved,
  levels = c("None", "Some", "Marked"),
  ordered = TRUE
)
Binning a numeric variable: cut() and arithmetic
table(10*floor(arthritis$Age/10))
##
## 20 30 40 50 60 70
## 4 11 11 29 26 3
# A single number for breaks asks cut() for that many equal-width intervals over the range of Age
table(cut(arthritis$Age, breaks=6))
##
## (22.9,31.5] (31.5,40] (40,48.5] (48.5,57] (57,65.5] (65.5,74.1]
## 8 7 10 19 26 14
# Store the decade of age (20, 30, ...) as a factor; plot() then picks a display based on variable type
arthritis$AgeGroup <- factor(10*floor(arthritis$Age/10))
plot(arthritis$Age) # index plot: Age against row number
plot(arthritis$AgeGroup) # barplot of counts for a single factor
plot(Improved ~ AgeGroup, data=arthritis) # spineplot for two factors (factor response vs. factor predictor)
plot(arthritis[,2:5]) # scatterplot matrix of Treatment, Sex, Age, Improved; not too useful for categorical variables