# WEBIN-R #16 : Analyses de séquences
# Load the occupational trajectories data set from the working directory.
donnees <- read_csv(file = "trajpro.csv")
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double()
## )
## i Use `spec()` for the full column specifications.
# Recode the numeric generation code (1 to 3) as a labelled factor
# giving the corresponding birth-year ranges.
donnees$generation <- factor(
  donnees$generation,
  levels = 1:3,
  labels = c("1930-1938", "1939-1945", "1946-1950")
)
# Named vector describing the nine states of the sequence alphabet:
# names are the short state codes, values are the long labels.
# The trailing number on each line is the numeric code used in the data.
labels <- c(
  agric = "agriculteurs exploitants", # 1
  acce = "artisans, commercants et chefs d'entreprise", # 2
  cadr = "cadres et professions intellectuelles supérieures", # 3
  pint = "professions intermédiaires", # 4
  empl = "employés", # 5
  ouvr = "ouvriers", # 6
  etud = "études", # 7
  inact = "inactivité", # 8
  smil = "service militaire" # 9
)
# Build the state-sequence object from the 37 yearly columns csp1..csp37.
# The numeric codes 1..9 form the alphabet; the short codes taken from
# names(labels) are used both as state names and as legend labels.
seq <- seqdef(
  donnees %>% select(csp1:csp37),
  alphabet = 1:9,
  states = names(labels),
  labels = names(labels)
)
## [>] state coding:
## [alphabet] [label] [long label]
## 1 1agric agric
## 2 2acce acce
## 3 3cadr cadr
## 4 4pint pint
## 5 5empl empl
## 6 6ouvr ouvr
## 7 7etud etud
## 8 8inact inact
## 9 9smil smil
## [>] 1000 sequences in the data set
## [>] min/max sequence length: 37/37
# Pairwise dissimilarities between sequences (LCS metric), converted to a
# "dist" object because hclust() and cmdscale() below expect that class.
seq.om <- seqdist(seq, method = "LCS") %>%
  as.dist()
## [>] 1000 sequences with 9 distinct states
## [>] creating a 'sm' with a substitution cost of 2
## [>] creating 9x9 substitution-cost matrix using 2 as constant value
## [>] 818 distinct sequences
## [>] min/max sequence lengths: 37/37
## [>] computing distances using the LCS metric
## [>] elapsed time: 1.88 secs
# Hierarchical clustering of the dissimilarities with Ward's criterion.
seq.arbre <- hclust(seq.om, method = "ward.D2")

# Step plot of the 20 largest merge heights, in decreasing order.
sort(seq.arbre$height, decreasing = TRUE) %>%
  head(n = 20) %>%
  plot(type = "s")
# Cut the dendrogram into five clusters and store the membership as a
# factor with levels "Classe 1" ... "Classe 5".
seq.part <- factor(
  cutree(seq.arbre, k = 5),
  levels = 1:5,
  labels = paste("Classe", 1:5)
)
# NOTE(review): the frequency table printed below has no visible call;
# presumably something like freq(seq.part) was lost here -- TODO confirm.
## n % val%
## Classe 1 366 36.6 36.6
## Classe 2 183 18.3 18.3
## Classe 3 104 10.4 10.4
## Classe 4 296 29.6 29.6
## Classe 5 51 5.1 5.1
# State distribution plot per cluster; x ticks are relabelled 14 to 50.
seqdplot(
  seq,
  group = seq.part,
  xtlab = 14:50
)

# One-dimensional classical MDS of the dissimilarities, used as sort key.
ordre <- cmdscale(seq.om, k = 1)

# Index plot per cluster, sequences sorted by their MDS coordinate.
seqIplot(
  seq,
  group = seq.part,
  xtlab = 14:50,
  sortv = ordre
)

# Heatmap of the sequences drawn alongside the clustering tree.
seq_heatmap(
  seq,
  seq.arbre,
  labCol = 14:50
)
# Attach a row identifier, the cluster membership and a plotting rank
# to every individual.
# seq_len() is used instead of 1:nrow() so an empty table yields an empty id.
donnees$id <- seq_len(nrow(donnees))
donnees$classe <- seq.part
# Rank of the MDS coordinate; ties are broken at random, so the exact
# ranks of tied sequences can differ between runs.
donnees$ordre <- ordre %>% rank(ties.method = "random")
# Reshape to long format: the columns csp1..csp37 are stacked, with the
# original column name stored in "annee" and its value in "csp".
long <- donnees %>%
  pivot_longer(
    cols = csp1:csp37,
    names_to = "annee",
    values_to = "csp"
  )
# Turn the numeric state code (1 to 9) into a factor carrying the long
# state labels defined in `labels`.
long$csp <- factor(
  long$csp,
  levels = 1:9,
  labels = labels
)
# Derive the age from the column name: drop the first three characters
# ("csp"), read the remaining index as an integer and add 13.
long$age <- as.integer(str_sub(long$annee, start = 4)) + 13
# Carpet plot of the sequences: one raster row per individual (ordered by
# `ordre`), one column per age, filled by state and faceted by cluster.
ggplot(long) +
  aes(x = age, y = factor(ordre), fill = csp) +
  geom_raster() +
  theme_bw() +
  scale_fill_brewer(palette = "Set3") +
  # Free y scales and spacing so each facet only shows its own sequences.
  facet_grid(rows = vars(classe), scales = "free_y", space = "free_y") +
  # Hide the individual labels on the y axis (argument name spelled out
  # instead of relying on partial matching of `label`).
  scale_y_discrete(labels = NULL) +
  # NOTE(review): the warning recorded below reports 2000 removed rows;
  # presumably the cells at ages 14 and 50 fall outside these limits --
  # TODO confirm and widen the limits if those ages should be drawn.
  scale_x_continuous(
    limits = c(14, 50),
    breaks = c(14, 20, 25, 30, 35, 40, 45, 50)
  )
## Warning: Removed 2000 rows containing missing values (geom_raster).
# Further TraMineR plots, each drawn per cluster with x ticks labelled 14-50.

# Sequence frequency plot.
seqfplot(
  seq,
  group = seq.part,
  xtlab = 14:50
)

# Modal state plot.
seqmsplot(
  seq,
  group = seq.part,
  xtlab = 14:50
)

# Mean time spent in each state.
seqmtplot(
  seq,
  group = seq.part,
  xtlab = 14:50
)

# Transversal entropy plot.
seqHtplot(
  seq,
  group = seq.part,
  xtlab = 14:50
)
# Cross-tabulate generation by cluster with row percentages, and add the
# p-value of the association test (the p-value column of the table below).
donnees %>%
  select(generation, classe) %>%
  tbl_summary(by = classe, percent = "row") %>%
  add_p()
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Characteristic | Classe 1, N = 366 [1] | Classe 2, N = 183 [1] | Classe 3, N = 104 [1] | Classe 4, N = 296 [1] | Classe 5, N = 51 [1] | p-value [2] |
## generation |  |  |  |  |  | 0.018 |
## 1930-1938 | 121 (36%) | 67 (20%) | 22 (6.5%) | 108 (32%) | 22 (6.5%) | |
## 1939-1945 | 96 (33%) | 54 (18%) | 41 (14%) | 86 (29%) | 18 (6.1%) | |
## 1946-1950 | 149 (41%) | 62 (17%) | 41 (11%) | 102 (28%) | 11 (3.0%) | |
## [1] n (%)
## [2] Pearson's Chi-squared test