# WEBIN-R #16 : Analyses de séquences
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3.9000 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(labelled)
library(TraMineR)
##
## TraMineR stable version 2.2-1 (Built: 2020-11-02)
## Website: http://traminer.unige.ch
## Please type 'citation("TraMineR")' for citation information.
library(tidyr)
donnees <- read_csv("trajpro.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double()
## )
## i Use `spec()` for the full column specifications.
donnees$generation <- factor(
donnees$generation,
levels = 1:3,
labels = c("1930-1938", "1939-1945", "1946-1950")
)
labels <- c(
agric = "agriculteurs exploitants", # 1
acce = "artisans, commercants et chefs d'entreprise", # 2
cadr = "cadres et professions intellectuelles supérieures", # 3
pint = "professions intermédiaires", # 4
empl = "employés", # 5
ouvr = "ouvriers", # 6
etud = "études", # 7
inact = "inactivité", # 8
smil = "service militaire" # 9
)
seq <- seqdef(
donnees %>% select(csp1:csp37),
alphabet = 1:9,
states = names(labels),
labels = names(labels)
)
## [>] state coding:
## [alphabet] [label] [long label]
## 1 1agric agric
## 2 2acce acce
## 3 3cadr cadr
## 4 4pint pint
## 5 5empl empl
## 6 6ouvr ouvr
## 7 7etud etud
## 8 8inact inact
## 9 9smil smil
## [>] 1000 sequences in the data set
## [>] min/max sequence length: 37/37
seq.om <- seqdist(seq, method = "LCS") %>%
as.dist()
## [>] 1000 sequences with 9 distinct states
## [>] creating a 'sm' with a substitution cost of 2
## [>] creating 9x9 substitution-cost matrix using 2 as constant value
## [>] 818 distinct sequences
## [>] min/max sequence lengths: 37/37
## [>] computing distances using the LCS metric
## [>] elapsed time: 1.88 secs
seq.arbre <- hclust(seq.om, method = "ward.D2")
plot(seq.arbre)
seq.arbre$height %>%
sort(decreasing = TRUE) %>%
head(20) %>%
plot(type = "s")
seq.part <- cutree(seq.arbre, k = 5) %>%
factor(levels = 1:5, labels = paste("Classe", 1:5))
questionr::freq(seq.part)
## n % val%
## Classe 1 366 36.6 36.6
## Classe 2 183 18.3 18.3
## Classe 3 104 10.4 10.4
## Classe 4 296 29.6 29.6
## Classe 5 51 5.1 5.1
seqdplot(seq, group = seq.part, xtlab = 14:50)
ordre <- seq.om %>% cmdscale(k = 1)
seqIplot(seq, group = seq.part, xtlab = 14:50, sortv = ordre)
library(seqhandbook)
seq_heatmap(seq, seq.arbre, labCol = 14:50)
donnees$id <- 1:nrow(donnees)
donnees$classe <- seq.part
donnees$ordre <- ordre %>% rank(ties.method = "random")
long <- donnees %>%
pivot_longer(
cols = csp1:csp37,
names_to = "annee",
values_to = "csp"
)
long$csp <- factor(
long$csp,
levels = 1:9,
labels = labels
)
long$age <- long$annee %>%
str_sub(start = 4) %>%
as.integer() + 13
ggplot(long) +
aes(x = age, y = factor(ordre), fill = csp) +
geom_raster() +
theme_bw() +
scale_fill_brewer(palette = "Set3") +
facet_grid(rows = vars(classe), scales = "free_y", space = "free_y") +
scale_y_discrete(label = NULL) +
scale_x_continuous(
limits = c(14, 50),
breaks = c(14, 20, 25, 30, 35, 40, 45, 50)
)
## Warning: Removed 2000 rows containing missing values (geom_raster).
seqfplot(seq, group = seq.part, xtlab = 14:50)
seqmsplot(seq, group = seq.part, xtlab = 14:50)
seqmtplot(seq, group = seq.part, xtlab = 14:50)
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "xtlab" n'est pas un
## paramètre graphique
## Warning in axis(if (horiz) 2 else 1, at = at.l, labels = names.arg, lty =
## axis.lty, : "xtlab" n'est pas un paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "xtlab"
## n'est pas un paramètre graphique
seqrplot(seq, group = seq.part, xtlab = 14:50, dist.matrix = seq.om, method = "dist")
## [>] number of objects (sum of weights): 366
## [>] max. distance: 74
## [>] neighborhood radius: 7.4
## [>] 4 representative(s) selected, coverage=28% (threshold=25%)
## [>] 343 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "method" n'est pas un paramètre graphique
## [>] number of objects (sum of weights): 183
## [>] max. distance: 74
## [>] neighborhood radius: 7.4
## [>] 1 representative(s) selected, coverage=57% (threshold=25%)
## [>] 121 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## [>] number of objects (sum of weights): 104
## [>] max. distance: 74
## [>] neighborhood radius: 7.4
## [>] 3 representative(s) selected, coverage=30% (threshold=25%)
## [>] 99 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## [>] number of objects (sum of weights): 296
## [>] max. distance: 74
## [>] neighborhood radius: 7.4
## [>] 1 representative(s) selected, coverage=48% (threshold=25%)
## [>] 204 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## [>] number of objects (sum of weights): 51
## [>] max. distance: 74
## [>] neighborhood radius: 7.4
## [>] 6 representative(s) selected, coverage=27% (threshold=25%)
## [>] 51 distinct sequence(s)
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
## Warning in plot.window(xlim, ylim, log = log, ...): "method" n'est pas un
## paramètre graphique
seqHtplot(seq, group = seq.part, xtlab = 14:50)
library(gtsummary)
donnees %>%
select(generation, classe) %>%
tbl_summary(by = classe, percent = "row") %>%
add_p()
## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.
Characteristic | Classe 1, N = 3661 | Classe 2, N = 1831 | Classe 3, N = 1041 | Classe 4, N = 2961 | Classe 5, N = 511 | p-value2 |
---|---|---|---|---|---|---|
generation | 0.018 | |||||
1930-1938 | 121 (36%) | 67 (20%) | 22 (6.5%) | 108 (32%) | 22 (6.5%) | |
1939-1945 | 96 (33%) | 54 (18%) | 41 (14%) | 86 (29%) | 18 (6.1%) | |
1946-1950 | 149 (41%) | 62 (17%) | 41 (11%) | 102 (28%) | 11 (3.0%) | |
1
n (%)
2
Pearson's Chi-squared test
|