Ugh, was für ein schreckliches Datenformat. Wir fügen eine ID-Spalte hinzu, bringen die Daten in ein sauberes, langes Format, und dann bekommen wir in ein paar Zeilen, was Sie wollen.
library(tidyr)
library(dplyr)
library(stringr)
# For each row of `dd`, find the degree with the highest column index whose
# year is before 2015; rows without any such degree are kept with NA values.

# Create the ID column and store it back in `dd`: the final join needs `dd`
# to carry `id` so that rows removed by the filter can be restored.
dd <- mutate(dd, id = row_number())

dd %>%
  # convert degree and year columns to long format
  gather(key = "degkey", value = "degree", starts_with("Deg")) %>%
  gather(key = "yearkey", value = "year", starts_with("Year")) %>%
  # pull the numbers into an index; integers so that index 10 sorts after 9
  mutate(
    yr_index = as.integer(str_extract(yearkey, "[0-9]+")),
    deg_index = as.integer(str_extract(degkey, "[0-9]+"))
  ) %>%
  # the two gathers produce every year/degree combination: keep only the
  # pairs that belong together, and filter to the years you want
  filter(yr_index == deg_index, year < 2015) %>%
  # order by descending index
  arrange(desc(yr_index)) %>%
  # keep relevant columns
  select(id, degree, year) %>%
  # for each ID, keep the top row (the highest remaining index)
  group_by(id) %>%
  slice(1) %>%
  # join back to the original (on the shared `id` column) to complete any
  # IDs lost in the filter step
  right_join(select(dd, id))
# Joining, by = "id"
# # A tibble: 6 x 3
# # Groups: id [?]
# id degree year
# <int> <chr> <int>
# 1 1 Master 2004
# 2 2 PHD 2010
# 3 3 College 2006
# 4 4 <NA> NA
# 5 5 Master 2004
# 6 6 Master 2014
# Warning message:
# attributes are not identical across measure variables; they will be dropped
Dabei wurden diese Daten verwendet:
# Example data in wide format: each row holds up to five (YearN, Deg_YearN)
# pairs; unused slots are NA.
dd <- read.table(text = "Year1 Deg_Year1 Year2 Deg_Year2 Year3 Deg_Year3 Year4 Deg_Year4 Year5 Deg_Year5
2001 College 2004 Master NA NA NA NA NA NA
2004 College 2004 Master 2010 PHD NA NA NA NA
2006 Master 2006 College NA NA NA NA NA NA
2016 Master NA NA NA NA NA NA NA NA
2002 Master 2003 Master 2004 College 2004 Master NA NA
2014 Master 2017 PHD NA NA NA NA NA NA",
  header = TRUE)
Vielen Dank!! –