Ich habe ein Datensatz von zwei Kurse in 2 verschiedenen Semestern, die die folgende Form annimmt:gruppenweise Zusammenfassungen/Subsets dplyr
set.seed(200)
sem <- sample(c("1", "2"), 200, replace = T)
course <- sample(c("1", "2"), 200, replace = T)
d.gender = sample(c(0, 1), 200, replace = T, prob = c(0.6, 0.4))
d.pass = sample(c(0, 1), 200, replace = T, prob = c(0.7, 0.3))
df <- data.frame(sem, course, d.gender, d.pass)
Ich bin zu versuchen effizient eine Tabl der 4 verschiedenen erstellen sem, Kurs Kombinationen zusammen mit ihrer gesamten Erfolgsquote, der Prozentsatz von d.gender = 1, und schließlich die Erfolgsquoten innerhalb dieser beiden Geschlechterkategorien. Ich kann eine Tabelle erstellen, die alle Werte enthält, die ich berechnen muss, aber ich weiß, dass es einen effizienteren Weg gibt, um zu berechnen, was ich brauche, ohne eine Menge verschiedener group_by- und summary-Funktionen zu verschachteln oder eine ganze Reihe von verschiedenen tbls zu machen Spalten, die ich will. Ich kann das bekommen, was ich mit Indexen und Subset-Funktionen loswerden muss, aber ich hoffe, dass es eine bessere Möglichkeit gibt, eine vierreihige Matrix mit allem zu bekommen, was ich brauche, aber es ist hässlich und dauert ewig, und es ist leicht, Fehler zu machen der Code:
df1 <- df %>% group_by(sem, course, d.gender, d.pass) %>% summarize(total = n())
df1$total_pass <- rep(NA, dim(df1)[1])
df1$total_pass[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.pass == "1",
select = total))
df1$total_pass[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.pass == "1",
select = total))
df1$total_pass[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.pass == "1",
select = total))
df1$n_male <- rep(NA, dim(df1)[1])
df1$n_male[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "1",
select = total))
df1$n_male[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "1",
select = total))
df1$n_male[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "1",
select = total))
df1$n_male[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "1",
select = total))
df1$n_fem <- rep(NA, dim(df1)[1])
df1$n_fem[1:4] <- sum(subset(df1, sem == "1" & course == "1" & d.gender == "0", select = total))
df1$n_fem[5:8] <- sum(subset(df1, sem == "1" & course == "2" & d.gender == "0", select = total))
df1$n_fem[9:12] <- sum(subset(df1, sem == "2" & course == "1" & d.gender == "0",
select = total))
df1$n_fem[13:16] <- sum(subset(df1, sem == "2" & course == "2" & d.gender == "0",
select = total))
df1$pct_male <- rep(NA, dim(df1)[1])
df1$pct_male[1:4] <- df1$n_male[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_male[5:8] <- df1$n_male[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_male[9:12] <- df1$n_male[9:12]/sum(subset(df1, sem == "2" & course == "1",
select = total))
df1$pct_male[13:16] <- df1$n_male[13:16]/sum(subset(df1, sem == "2" & course == "2",
select = total))
df1$pct_fem <- rep(NA, dim(df1)[1])
df1$pct_fem <- 1 - df1$pct_male
df1$pct_pass <- rep(NA, dim(df1)[1])
df1$pct_pass[1:4] <- df1$total_pass[1:4]/sum(subset(df1, sem == "1" & course == "1",
select = total))
df1$pct_pass[5:8] <- df1$total_pass[5:8]/sum(subset(df1, sem == "1" & course == "2",
select = total))
df1$pct_pass[9:12] <- df1$total_pass[9:12]/sum(subset(df1, sem == "2" & course ==
"1", select = total))
df1$pct_pass[13:16] <- df1$total_pass[13:16]/sum(subset(df1, sem == "2" & course ==
"2", select = total))
df1$male_pass_pct <- rep(NA, dim(df1)[1])
df1$male_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[1:4]
df1$male_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[5:8]
df1$male_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "1" &
d.pass == "1", select = total)/df1$n_male[9:12]
df1$male_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender ==
"1" & d.pass == "1", select = total)/df1$n_male[13:16]
df1$fem_pass_pct <- rep(NA, dim(df1)[1])
df1$fem_pass_pct[1:4] <- subset(df1, sem == "1" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[1:4]
df1$fem_pass_pct[5:8] <- subset(df1, sem == "1" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[5:8]
df1$fem_pass_pct[9:12] <- subset(df1, sem == "2" & course == "1" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[9:12]
df1$fem_pass_pct[13:16] <- subset(df1, sem == "2" & course == "2" & d.gender == "0" &
d.pass == "1", select = total)/df1$n_fem[13:16]
df2 <- df1 %>%
group_by(sem, course) %>%
summarize(total_pass = first(total_pass),
pct_pass = first(pct_pass),
n_male = first(n_male),
n_fem = first(n_fem),
pct_male = first(pct_male),
pct_fem = first(pct_fem),
male_pass_pct = first(male_pass_pct),
fem_pass_pct = first(fem_pass_pct))
df2 <- unique(df1[, c(1, 2, 6, 11, 7:10, 12, 13)])
df2[, c(9, 10)] <- lapply(df2[, c(9, 10)], as.numeric)
, die nur um Maßnahmen für 4 Reihen wirklich mühsam, aber ich kann es nicht anders für diese Aggregation zu arbeiten ... Jede Hilfe wäre
danke, danke. Ich wusste, dass es einen leichteren Weg gab. Ich wusste nicht, dass Sie logische Ausdrücke innerhalb der Zusammenfassungsfunktion verwenden könnten. Das macht die Dinge viel einfacher –