2017-12-04 1 views
1

Wie extrahiert man einen String als Teil einer dplyr::mutate-Pipe?

Ich habe folgenden Datenrahmen:


library(tidyverse) 

# Example data: ten peaks on chr1. The object is assembled by hand as a
# tibble (class vector plus compact automatic row names), so it prints as
# "A tibble: 10 x 3" once the tidyverse is attached.
df <- structure(
  list(
    pfc_chr = rep("chr1", 10L),
    pfc_chr_st = c(
      3046442L, 3119671L, 3164756L, 3167322L, 838L,
      3212196L, 3249068L, 3268246L, 3444892L, 3451544L
    ),
    peak_name = c(
      "XXX-ad_peak_1", "XXX-ad_peak_2a", "PMN_peak_2", "Ytb_peak_3",
      "PMN_peak_3", "XXX-ad_peak_6", "XXX-ad_peak_8", "PMN_peak_5",
      "XXX-ad_peak_11", "XXX-ad_peak_12"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -10L)
)


df 
#> # A tibble: 10 x 3 
#> pfc_chr pfc_chr_st  peak_name 
#>  <chr>  <int>   <chr> 
#> 1 chr1 3046442 XXX-ad_peak_1 
#> 2 chr1 3119671 XXX-ad_peak_2a 
#> 3 chr1 3164756  PMN_peak_2 
#> 4 chr1 3167322  Ytb_peak_3 
#> 5 chr838  PMN_peak_3 
#> 6 chr1 3212196 XXX-ad_peak_6 
#> 7 chr1 3249068 XXX-ad_peak_8 
#> 8 chr1 3268246  PMN_peak_5 
#> 9 chr1 3444892 XXX-ad_peak_11 
#> 10 chr1 3451544 XXX-ad_peak_12 

Ich möchte den Teil von peak_name vor "_peak" als Teil einer dplyr-Pipe extrahieren. Das gewünschte Endergebnis ist:

pfc_chr pfc_chr_st  peak_name  new_col 
1  chr1 3046442 XXX-ad_peak_1   XXX-ad 
2  chr1 3119671 XXX-ad_peak_2a   XXX-ad 
3  chr1 3164756  PMN_peak_2   PMN 
4  chr1 3167322  Ytb_peak_3   Ytb 
5  chr838  PMN_peak_3   PMN 
6  chr1 3212196 XXX-ad_peak_6   XXX-ad 
7  chr1 3249068 XXX-ad_peak_8   XXX-ad 
8  chr1 3268246  PMN_peak_5   PMN 
9  chr1 3444892 XXX-ad_peak_11   XXX-ad 
10 chr1 3451544 XXX-ad_peak_12   XXX-ad 

Ich habe Folgendes versucht, aber es funktioniert nicht:

> df %>% mutate(new_col = stringr::str_match(peak_name, "^(.*?)\\_peak\\_*?")) 
Error in mutate_impl(.data, dots) : 
    Column `new_col` must be length 10 (the number of rows) or one, not 20 

Was ist der richtige Weg, es zu tun?

+2

Versuchen Sie `sub("^(.*?)_peak_.*", "\\1", peak_name)` ([Regex-Demo](https://regex101.com/r/yqKrml/1)) statt `stringr::str_match(...)` –

+1

oder sogar `sub("_peak.*$", "", peak_name)` – Hugh

Antwort

3

Ich würde stringr::str_extract() mit einem Lookahead vorschlagen:

# `^.*` is greedy and the lookahead `(?=_peak)` is not consumed, so the match
# is everything from the start of the string up to the last "_peak".
df %>%
  mutate(
    new_col = stringr::str_extract(peak_name, "^.*(?=_peak)")
  )

Die Ergebnisse sind unten dargestellt:

> df %>% 
+ mutate(new_col = stringr::str_extract(peak_name, "^.*(?=_peak)")) 
# A tibble: 10 x 4 
    pfc_chr pfc_chr_st  peak_name new_col 
    <chr>  <int>   <chr> <chr> 
1 chr1 3046442 XXX-ad_peak_1 XXX-ad 
2 chr1 3119671 XXX-ad_peak_2a XXX-ad 
3 chr1 3164756  PMN_peak_2  PMN 
4 chr1 3167322  Ytb_peak_3  Ytb 
5 chr838  PMN_peak_3  PMN 
6 chr1 3212196 XXX-ad_peak_6 XXX-ad 
7 chr1 3249068 XXX-ad_peak_8 XXX-ad 
8 chr1 3268246  PMN_peak_5  PMN 
9 chr1 3444892 XXX-ad_peak_11 XXX-ad 
10 chr1 3451544 XXX-ad_peak_12 XXX-ad 

Beachten Sie, dass Daten wie "_peak_8" eine leere Zeichenfolge ergeben würden; Daten wie "peak_8" geben NA zurück.

1

Wählen Sie die zweite Spalte der von str_match() zurückgegebenen Matrix aus.

# str_match() returns a character matrix (full match in column 1, capture
# groups after it); `[, 2]` keeps only the first capture group so that
# mutate() receives a plain vector with one element per row.
df %>%
  mutate(
    new_col = stringr::str_match(peak_name, "^(.*?)\\_peak\\_*?")[, 2]
  )

Ausgabe

pfc_chr pfc_chr_st  peak_name new_col 
1 chr1 3046442 XXX-ad_peak_1 XXX-ad 
2 chr1 3119671 XXX-ad_peak_2a XXX-ad 
3 chr1 3164756  PMN_peak_2  PMN 
4 chr1 3167322  Ytb_peak_3  Ytb 
5 chr838  PMN_peak_3  PMN 
6 chr1 3212196 XXX-ad_peak_6 XXX-ad 
7 chr1 3249068 XXX-ad_peak_8 XXX-ad 
8 chr1 3268246  PMN_peak_5  PMN 
9 chr1 3444892 XXX-ad_peak_11 XXX-ad 
10 chr1 3451544 XXX-ad_peak_12 XXX-ad 
Verwandte Themen