Mam listę ciągów, które mają cztery elementy:

a_b_c_d gdzie:

 • a ma 3 wzory ciągów: str, jtp i mdl
 • b ma 5 wzorów ciągów: HBW, HBS, HBO, NHBB i NHBO
 • c ma 4 wzory łańcucha: L, M, H i ALL
 • d ma 4 wzory ciągów: NMT, MC, CAR i PT
 • a, b, c i d są połączone znakiem _ w tej kolejności
 • Niektóre rekordy nie mają b, c i/lub d

Muszę wyodrębnić c z każdego ciągu jako nowe pole income. Jeśli c nie istnieje, należy je zastąpić wartością NA. Poniżej znajdują się rzeczywiste dane, których używam:

# Sample input: 204 names of the form a_b_c_d in which the b, c and/or d parts
# may be missing.
# NOTE: despite its name, `df` is a plain character vector here, not a data
# frame.
df <- c(
"str_HBW_L_NMT" ,"str_HBW_M_NMT" ,"str_HBW_H_NMT" ,"str_HBW_L_MC" ,"str_HBW_M_MC" ,"str_HBW_H_MC" ,
"str_HBW_L_CAR" ,"str_HBW_M_CAR" ,"str_HBW_H_CAR" ,"str_HBW_L_PT" ,"str_HBW_M_PT" ,"str_HBW_H_PT" ,
"str_HBS_L_NMT" ,"str_HBS_M_NMT" ,"str_HBS_H_NMT" ,"str_HBS_L_MC" ,"str_HBS_M_MC" ,"str_HBS_H_MC" ,
"str_HBS_L_CAR" ,"str_HBS_M_CAR" ,"str_HBS_H_CAR" ,"str_HBS_L_PT" ,"str_HBS_M_PT" ,"str_HBS_H_PT" ,
"str_HBO_L_NMT" ,"str_HBO_M_NMT" ,"str_HBO_H_NMT" ,"str_HBO_L_MC" ,"str_HBO_M_MC" ,"str_HBO_H_MC" ,
"str_HBO_L_CAR" ,"str_HBO_M_CAR" ,"str_HBO_H_CAR" ,"str_HBO_L_PT" ,"str_HBO_M_PT" ,"str_HBO_H_PT" ,
"str_NHBB_L_NMT","str_NHBB_M_NMT","str_NHBB_H_NMT","str_NHBB_L_MC","str_NHBB_M_MC","str_NHBB_H_MC",
"str_NHBB_L_CAR","str_NHBB_M_CAR","str_NHBB_H_CAR","str_NHBB_L_PT","str_NHBB_M_PT","str_NHBB_H_PT",
"str_NHBO_L_NMT","str_NHBO_M_NMT","str_NHBO_H_NMT","str_NHBO_L_MC","str_NHBO_M_MC","str_NHBO_H_MC",
"str_NHBO_L_CAR","str_NHBO_M_CAR","str_NHBO_H_CAR","str_NHBO_L_PT","str_NHBO_M_PT","str_NHBO_H_PT",
"str_HBW_L"   ,"str_HBW_M"   ,"str_HBW_H"   ,"str_HBS_L"  ,"str_HBS_M"  ,"str_HBS_H"  ,
"str_HBO_L"   ,"str_HBO_M"   ,"str_HBO_H"   ,"str_NHBB_L"  ,"str_NHBB_M"  ,"str_NHBB_H"  ,
"str_NHBO_L"  ,"str_NHBO_M"  ,"str_NHBO_H"  ,"str_HBW"   ,"str_HBS"   ,"str_HBO"   ,
"str_NHBB"   ,"str_NHBO"   ,"str_L"     ,"str_M"    ,"str_H"    ,"str_ALL"   ,
"jtp_HBW_L_NMT" ,"jtp_HBW_M_NMT" ,"jtp_HBW_H_NMT" ,"jtp_HBW_L_MC" ,"jtp_HBW_M_MC" ,"jtp_HBW_H_MC" ,
"jtp_HBW_L_CAR" ,"jtp_HBW_M_CAR" ,"jtp_HBW_H_CAR" ,"jtp_HBW_L_PT" ,"jtp_HBW_M_PT" ,"jtp_HBW_H_PT" ,
"jtp_HBS_L_NMT" ,"jtp_HBS_M_NMT" ,"jtp_HBS_H_NMT" ,"jtp_HBS_L_MC" ,"jtp_HBS_M_MC" ,"jtp_HBS_H_MC" ,
"jtp_HBS_L_CAR" ,"jtp_HBS_M_CAR" ,"jtp_HBS_H_CAR" ,"jtp_HBS_L_PT" ,"jtp_HBS_M_PT" ,"jtp_HBS_H_PT" ,
"jtp_HBW_L"   ,"jtp_HBW_M"   ,"jtp_HBW_H"   ,"jtp_HBS_L"  ,"jtp_HBS_M"  ,"jtp_HBS_H"  ,
"jtp_HBW"    ,"jtp_HBS"    ,"jtp_L"     ,"jtp_M"    ,"jtp_H"    ,"jtp_ALL"   ,
"mdl_HBW_L_NMT" ,"mdl_HBW_M_NMT" ,"mdl_HBW_H_NMT" ,"mdl_HBW_L_MC" ,"mdl_HBW_M_MC" ,"mdl_HBW_H_MC" ,
"mdl_HBW_L_CAR" ,"mdl_HBW_M_CAR" ,"mdl_HBW_H_CAR" ,"mdl_HBW_L_PT" ,"mdl_HBW_M_PT" ,"mdl_HBW_H_PT" ,
"mdl_HBS_L_NMT" ,"mdl_HBS_M_NMT" ,"mdl_HBS_H_NMT" ,"mdl_HBS_L_MC" ,"mdl_HBS_M_MC" ,"mdl_HBS_H_MC" ,
"mdl_HBS_L_CAR" ,"mdl_HBS_M_CAR" ,"mdl_HBS_H_CAR" ,"mdl_HBS_L_PT" ,"mdl_HBS_M_PT" ,"mdl_HBS_H_PT" ,
"mdl_HBO_L_NMT" ,"mdl_HBO_M_NMT" ,"mdl_HBO_H_NMT" ,"mdl_HBO_L_MC" ,"mdl_HBO_M_MC" ,"mdl_HBO_H_MC" ,
"mdl_HBO_L_CAR" ,"mdl_HBO_M_CAR" ,"mdl_HBO_H_CAR" ,"mdl_HBO_L_PT" ,"mdl_HBO_M_PT" ,"mdl_HBO_H_PT" ,
"mdl_NHBB_L_NMT","mdl_NHBB_M_NMT","mdl_NHBB_H_NMT","mdl_NHBB_L_MC","mdl_NHBB_M_MC","mdl_NHBB_H_MC",
"mdl_NHBB_L_CAR","mdl_NHBB_M_CAR","mdl_NHBB_H_CAR","mdl_NHBB_L_PT","mdl_NHBB_M_PT","mdl_NHBB_H_PT",
"mdl_NHBO_L_NMT","mdl_NHBO_M_NMT","mdl_NHBO_H_NMT","mdl_NHBO_L_MC","mdl_NHBO_M_MC","mdl_NHBO_H_MC",
"mdl_NHBO_L_CAR","mdl_NHBO_M_CAR","mdl_NHBO_H_CAR","mdl_NHBO_L_PT","mdl_NHBO_M_PT","mdl_NHBO_H_PT",
"mdl_HBW_L"   ,"mdl_HBW_M"   ,"mdl_HBW_H"   ,"mdl_HBS_L"  ,"mdl_HBS_M"  ,"mdl_HBS_H"  ,
"mdl_HBO_L"   ,"mdl_HBO_M"   ,"mdl_HBO_H"   ,"mdl_NHBB_L"  ,"mdl_NHBB_M"  ,"mdl_NHBB_H"  ,
"mdl_NHBO_L"  ,"mdl_NHBO_M"  ,"mdl_NHBO_H"  ,"mdl_HBW"   ,"mdl_HBS"   ,"mdl_HBO"   ,
"mdl_NHBB"   ,"mdl_NHBO"   ,"mdl_L"     ,"mdl_M"    ,"mdl_H"    ,"mdl_ALL"
)

Podjąłem wiele prób, ale nie udało mi się poprawnie wyodrębnić c. Poniżej znajduje się przykład mojego skryptu:

# Asker's attempt, which does not produce the desired `income` column.
# NOTE(review): `[L|M|H|(ALL)]` is a character class, so it matches exactly one
# character out of L, M, H, A, |, ( and ) instead of the alternatives
# L, M, H or ALL.
# NOTE(review): str_extract_all() returns a list with every match per input,
# not a single value per row.
# NOTE(review): `df` is defined above as a character vector; mutate() and the
# `name` column presumably assume it was wrapped in a data frame — confirm.
df %>% mutate(income=str_extract_all(string=name,
    pattern="(?!str|jtp|mdl|HBW|HBS|HBO|NHBB|NHBO|_)[L|M|H|(ALL)](?!NMT|MC|CAR|PT|_)"))

Czy masz jakieś sugestie, jak uzyskać żądany wynik pokazany poniżej? Wolę używać tidyverse i stringr, ale zamiast stringr może być też funkcja z bazowego R.

   name     income
1  str_HBW_L_NMT L
2  str_HBW_M_NMT M
3  str_HBW_H_NMT H
4   str_HBW_L_MC L
5   str_HBW_M_MC M
6   str_HBW_H_MC H
7  str_HBW_L_CAR L
8  str_HBW_M_CAR M
9  str_HBW_H_CAR H
10  str_HBW_L_PT L
11  str_HBW_M_PT M
12  str_HBW_H_PT H
13  str_HBS_L_NMT L
14  str_HBS_M_NMT M
15  str_HBS_H_NMT H
16  str_HBS_L_MC L
17  str_HBS_M_MC M
18  str_HBS_H_MC H
19  str_HBS_L_CAR L
20  str_HBS_M_CAR M
21  str_HBS_H_CAR H
22  str_HBS_L_PT L
23  str_HBS_M_PT M
24  str_HBS_H_PT H
25  str_HBO_L_NMT L
26  str_HBO_M_NMT M
27  str_HBO_H_NMT H
28  str_HBO_L_MC L
29  str_HBO_M_MC M
30  str_HBO_H_MC H
31  str_HBO_L_CAR L
32  str_HBO_M_CAR M
33  str_HBO_H_CAR H
34  str_HBO_L_PT L
35  str_HBO_M_PT M
36  str_HBO_H_PT H
37 str_NHBB_L_NMT L
38 str_NHBB_M_NMT M
39 str_NHBB_H_NMT H
40  str_NHBB_L_MC L
41  str_NHBB_M_MC M
42  str_NHBB_H_MC H
43 str_NHBB_L_CAR L
44 str_NHBB_M_CAR M
45 str_NHBB_H_CAR H
46  str_NHBB_L_PT L
47  str_NHBB_M_PT M
48  str_NHBB_H_PT H
49 str_NHBO_L_NMT L
50 str_NHBO_M_NMT M
51 str_NHBO_H_NMT H
52  str_NHBO_L_MC L
53  str_NHBO_M_MC M
54  str_NHBO_H_MC H
55 str_NHBO_L_CAR L
56 str_NHBO_M_CAR M
57 str_NHBO_H_CAR H
58  str_NHBO_L_PT L
59  str_NHBO_M_PT M
60  str_NHBO_H_PT H
61    str_HBW_L L
62    str_HBW_M M
63    str_HBW_H H
64    str_HBS_L L
65    str_HBS_M M
66    str_HBS_H H
67    str_HBO_L L
68    str_HBO_M M
69    str_HBO_H H
70   str_NHBB_L L
71   str_NHBB_M M
72   str_NHBB_H H
73   str_NHBO_L L
74   str_NHBO_M M
75   str_NHBO_H H
76     str_HBW <NA>
77     str_HBS <NA>
78     str_HBO <NA>
79    str_NHBB <NA>
80    str_NHBO <NA>
81      str_L L
82      str_M M
83      str_H H
84     str_ALL ALL
85  jtp_HBW_L_NMT L
86  jtp_HBW_M_NMT M
87  jtp_HBW_H_NMT H
88  jtp_HBW_L_MC L
89  jtp_HBW_M_MC M
90  jtp_HBW_H_MC H
91  jtp_HBW_L_CAR L
92  jtp_HBW_M_CAR M
93  jtp_HBW_H_CAR H
94  jtp_HBW_L_PT L
95  jtp_HBW_M_PT M
96  jtp_HBW_H_PT H
97  jtp_HBS_L_NMT L
98  jtp_HBS_M_NMT M
99  jtp_HBS_H_NMT H
100  jtp_HBS_L_MC L
101  jtp_HBS_M_MC M
102  jtp_HBS_H_MC H
103 jtp_HBS_L_CAR L
104 jtp_HBS_M_CAR M
105 jtp_HBS_H_CAR H
106  jtp_HBS_L_PT L
107  jtp_HBS_M_PT M
108  jtp_HBS_H_PT H
109   jtp_HBW_L L
110   jtp_HBW_M M
111   jtp_HBW_H H
112   jtp_HBS_L L
113   jtp_HBS_M M
114   jtp_HBS_H H
115    jtp_HBW <NA>
116    jtp_HBS <NA>
117     jtp_L L
118     jtp_M M
119     jtp_H H
120    jtp_ALL ALL
121 mdl_HBW_L_NMT L
122 mdl_HBW_M_NMT M
123 mdl_HBW_H_NMT H
124  mdl_HBW_L_MC L
125  mdl_HBW_M_MC M
126  mdl_HBW_H_MC H
127 mdl_HBW_L_CAR L
128 mdl_HBW_M_CAR M
129 mdl_HBW_H_CAR H
130  mdl_HBW_L_PT L
131  mdl_HBW_M_PT M
132  mdl_HBW_H_PT H
133 mdl_HBS_L_NMT L
134 mdl_HBS_M_NMT M
135 mdl_HBS_H_NMT H
136  mdl_HBS_L_MC L
137  mdl_HBS_M_MC M
138  mdl_HBS_H_MC H
139 mdl_HBS_L_CAR L
140 mdl_HBS_M_CAR M
141 mdl_HBS_H_CAR H
142  mdl_HBS_L_PT L
143  mdl_HBS_M_PT M
144  mdl_HBS_H_PT H
145 mdl_HBO_L_NMT L
146 mdl_HBO_M_NMT M
147 mdl_HBO_H_NMT H
148  mdl_HBO_L_MC L
149  mdl_HBO_M_MC M
150  mdl_HBO_H_MC H
151 mdl_HBO_L_CAR L
152 mdl_HBO_M_CAR M
153 mdl_HBO_H_CAR H
154  mdl_HBO_L_PT L
155  mdl_HBO_M_PT M
156  mdl_HBO_H_PT H
157 mdl_NHBB_L_NMT L
158 mdl_NHBB_M_NMT M
159 mdl_NHBB_H_NMT H
160 mdl_NHBB_L_MC L
161 mdl_NHBB_M_MC M
162 mdl_NHBB_H_MC H
163 mdl_NHBB_L_CAR L
164 mdl_NHBB_M_CAR M
165 mdl_NHBB_H_CAR H
166 mdl_NHBB_L_PT L
167 mdl_NHBB_M_PT M
168 mdl_NHBB_H_PT H
169 mdl_NHBO_L_NMT L
170 mdl_NHBO_M_NMT M
171 mdl_NHBO_H_NMT H
172 mdl_NHBO_L_MC L
173 mdl_NHBO_M_MC M
174 mdl_NHBO_H_MC H
175 mdl_NHBO_L_CAR L
176 mdl_NHBO_M_CAR M
177 mdl_NHBO_H_CAR H
178 mdl_NHBO_L_PT L
179 mdl_NHBO_M_PT M
180 mdl_NHBO_H_PT H
181   mdl_HBW_L L
182   mdl_HBW_M M
183   mdl_HBW_H H
184   mdl_HBS_L L
185   mdl_HBS_M M
186   mdl_HBS_H H
187   mdl_HBO_L L
188   mdl_HBO_M M
189   mdl_HBO_H H
190   mdl_NHBB_L L
191   mdl_NHBB_M M
192   mdl_NHBB_H H
193   mdl_NHBO_L L
194   mdl_NHBO_M M
195   mdl_NHBO_H H
196    mdl_HBW <NA>
197    mdl_HBS <NA>
198    mdl_HBO <NA>
199    mdl_NHBB <NA>
200    mdl_NHBO <NA>
201     mdl_L L
202     mdl_M M
203     mdl_H H
204    mdl_ALL ALL

============== Nowy przykład danych ========== Rekordy zawierające tylko c lub tylko d zostały dodane na początku oryginalnego df.

# Extended sample input: the original 204 names preceded by eight names that
# consist of only a d part (NMT, MC, CAR, PT) or only a c part (L, M, H, ALL),
# 212 names in total.
# NOTE: despite its name, `df` is a plain character vector, not a data frame.
df <- c(
"NMT","MC","CAR","PT","L","M","H","ALL",
"str_HBW_L_NMT" ,"str_HBW_M_NMT" ,"str_HBW_H_NMT" ,"str_HBW_L_MC" ,"str_HBW_M_MC" ,"str_HBW_H_MC" ,
"str_HBW_L_CAR" ,"str_HBW_M_CAR" ,"str_HBW_H_CAR" ,"str_HBW_L_PT" ,"str_HBW_M_PT" ,"str_HBW_H_PT" ,
"str_HBS_L_NMT" ,"str_HBS_M_NMT" ,"str_HBS_H_NMT" ,"str_HBS_L_MC" ,"str_HBS_M_MC" ,"str_HBS_H_MC" ,
"str_HBS_L_CAR" ,"str_HBS_M_CAR" ,"str_HBS_H_CAR" ,"str_HBS_L_PT" ,"str_HBS_M_PT" ,"str_HBS_H_PT" ,
"str_HBO_L_NMT" ,"str_HBO_M_NMT" ,"str_HBO_H_NMT" ,"str_HBO_L_MC" ,"str_HBO_M_MC" ,"str_HBO_H_MC" ,
"str_HBO_L_CAR" ,"str_HBO_M_CAR" ,"str_HBO_H_CAR" ,"str_HBO_L_PT" ,"str_HBO_M_PT" ,"str_HBO_H_PT" ,
"str_NHBB_L_NMT","str_NHBB_M_NMT","str_NHBB_H_NMT","str_NHBB_L_MC","str_NHBB_M_MC","str_NHBB_H_MC",
"str_NHBB_L_CAR","str_NHBB_M_CAR","str_NHBB_H_CAR","str_NHBB_L_PT","str_NHBB_M_PT","str_NHBB_H_PT",
"str_NHBO_L_NMT","str_NHBO_M_NMT","str_NHBO_H_NMT","str_NHBO_L_MC","str_NHBO_M_MC","str_NHBO_H_MC",
"str_NHBO_L_CAR","str_NHBO_M_CAR","str_NHBO_H_CAR","str_NHBO_L_PT","str_NHBO_M_PT","str_NHBO_H_PT",
"str_HBW_L"   ,"str_HBW_M"   ,"str_HBW_H"   ,"str_HBS_L"  ,"str_HBS_M"  ,"str_HBS_H"  ,
"str_HBO_L"   ,"str_HBO_M"   ,"str_HBO_H"   ,"str_NHBB_L"  ,"str_NHBB_M"  ,"str_NHBB_H"  ,
"str_NHBO_L"  ,"str_NHBO_M"  ,"str_NHBO_H"  ,"str_HBW"   ,"str_HBS"   ,"str_HBO"   ,
"str_NHBB"   ,"str_NHBO"   ,"str_L"     ,"str_M"    ,"str_H"    ,"str_ALL"   ,
"jtp_HBW_L_NMT" ,"jtp_HBW_M_NMT" ,"jtp_HBW_H_NMT" ,"jtp_HBW_L_MC" ,"jtp_HBW_M_MC" ,"jtp_HBW_H_MC" ,
"jtp_HBW_L_CAR" ,"jtp_HBW_M_CAR" ,"jtp_HBW_H_CAR" ,"jtp_HBW_L_PT" ,"jtp_HBW_M_PT" ,"jtp_HBW_H_PT" ,
"jtp_HBS_L_NMT" ,"jtp_HBS_M_NMT" ,"jtp_HBS_H_NMT" ,"jtp_HBS_L_MC" ,"jtp_HBS_M_MC" ,"jtp_HBS_H_MC" ,
"jtp_HBS_L_CAR" ,"jtp_HBS_M_CAR" ,"jtp_HBS_H_CAR" ,"jtp_HBS_L_PT" ,"jtp_HBS_M_PT" ,"jtp_HBS_H_PT" ,
"jtp_HBW_L"   ,"jtp_HBW_M"   ,"jtp_HBW_H"   ,"jtp_HBS_L"  ,"jtp_HBS_M"  ,"jtp_HBS_H"  ,
"jtp_HBW"    ,"jtp_HBS"    ,"jtp_L"     ,"jtp_M"    ,"jtp_H"    ,"jtp_ALL"   ,
"mdl_HBW_L_NMT" ,"mdl_HBW_M_NMT" ,"mdl_HBW_H_NMT" ,"mdl_HBW_L_MC" ,"mdl_HBW_M_MC" ,"mdl_HBW_H_MC" ,
"mdl_HBW_L_CAR" ,"mdl_HBW_M_CAR" ,"mdl_HBW_H_CAR" ,"mdl_HBW_L_PT" ,"mdl_HBW_M_PT" ,"mdl_HBW_H_PT" ,
"mdl_HBS_L_NMT" ,"mdl_HBS_M_NMT" ,"mdl_HBS_H_NMT" ,"mdl_HBS_L_MC" ,"mdl_HBS_M_MC" ,"mdl_HBS_H_MC" ,
"mdl_HBS_L_CAR" ,"mdl_HBS_M_CAR" ,"mdl_HBS_H_CAR" ,"mdl_HBS_L_PT" ,"mdl_HBS_M_PT" ,"mdl_HBS_H_PT" ,
"mdl_HBO_L_NMT" ,"mdl_HBO_M_NMT" ,"mdl_HBO_H_NMT" ,"mdl_HBO_L_MC" ,"mdl_HBO_M_MC" ,"mdl_HBO_H_MC" ,
"mdl_HBO_L_CAR" ,"mdl_HBO_M_CAR" ,"mdl_HBO_H_CAR" ,"mdl_HBO_L_PT" ,"mdl_HBO_M_PT" ,"mdl_HBO_H_PT" ,
"mdl_NHBB_L_NMT","mdl_NHBB_M_NMT","mdl_NHBB_H_NMT","mdl_NHBB_L_MC","mdl_NHBB_M_MC","mdl_NHBB_H_MC",
"mdl_NHBB_L_CAR","mdl_NHBB_M_CAR","mdl_NHBB_H_CAR","mdl_NHBB_L_PT","mdl_NHBB_M_PT","mdl_NHBB_H_PT",
"mdl_NHBO_L_NMT","mdl_NHBO_M_NMT","mdl_NHBO_H_NMT","mdl_NHBO_L_MC","mdl_NHBO_M_MC","mdl_NHBO_H_MC",
"mdl_NHBO_L_CAR","mdl_NHBO_M_CAR","mdl_NHBO_H_CAR","mdl_NHBO_L_PT","mdl_NHBO_M_PT","mdl_NHBO_H_PT",
"mdl_HBW_L"   ,"mdl_HBW_M"   ,"mdl_HBW_H"   ,"mdl_HBS_L"  ,"mdl_HBS_M"  ,"mdl_HBS_H"  ,
"mdl_HBO_L"   ,"mdl_HBO_M"   ,"mdl_HBO_H"   ,"mdl_NHBB_L"  ,"mdl_NHBB_M"  ,"mdl_NHBB_H"  ,
"mdl_NHBO_L"  ,"mdl_NHBO_M"  ,"mdl_NHBO_H"  ,"mdl_HBW"   ,"mdl_HBS"   ,"mdl_HBO"   ,
"mdl_NHBB"   ,"mdl_NHBO"   ,"mdl_L"     ,"mdl_M"    ,"mdl_H"    ,"mdl_ALL"
)
1
Hideo.S 4 czerwiec 2018, 11:58

3 odpowiedzi

Najlepsza odpowiedź

Możesz utworzyć wyrażenie regularne, które dopasuje wszystkie Twoje dane wejściowe dzięki opcjonalnym grupom dla części a, b, c i d, usunąć z ciągu wszystko oprócz części c, a następnie zastąpić puste elementy wartościami NA:

# Replace each whole name with just its optional c part (capture group 1);
# names without a c part collapse to the empty string, which is then turned
# into NA.
res <- sub(
  "^(?:(?:str|jtp|mdl)(?:_|$))?(?:(?:HB[WSO]|NHB[BO])(?:_|$))?(?:([LMH]|ALL)(?:_|$))?(?:NMT|MC|CAR|PT)?$",
  "\\1",
  df
)
res <- replace(res, nchar(res) == 0, NA)
> res
 [1] NA  NA  NA  NA  "L"  "M"  "H"  "ALL" "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M" 
 [29] "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H" 
 [57] "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  NA  
 [85] NA  NA  NA  NA  "L"  "M"  "H"  "ALL" "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M" 
[113] "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  NA  NA  "L"  "M"  "H"  "ALL" "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H" 
[141] "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L" 
[169] "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M"  "H"  "L"  "M" 
[197] "H"  "L"  "M"  "H"  "L"  "M"  "H"  NA  NA  NA  NA  NA  "L"  "M"  "H"  "ALL"

Zobacz REGEX DEMO.

Jak widać, nie potrzebujesz tutaj żadnego dodatkowego pakietu.

Szczegóły wzoru

 • ^ - początek ciągu
 • (?:(?:str|jtp|mdl)(?:_|$))? - część a (opcjonalna grupa nieprzechwytująca): str, jtp lub mdl, po których następuje _ lub koniec ciągu ((?:_|$))
 • (?:(?:HB[WSO]|NHB[BO])(?:_|$))? - część b (opcjonalna grupa nieprzechwytująca): HBW, HBS, HBO, NHBB lub NHBO, po których następuje _ lub koniec ciągu ((?:_|$))
 • (?:([LMH]|ALL)(?:_|$))? - część c (opcjonalna grupa nieprzechwytująca): grupa przechwytująca 1 dopasowuje L, M, H lub ALL, po których następuje _ lub koniec ciągu ((?:_|$))
 • (?:NMT|MC|CAR|PT)? - część d (opcjonalna grupa nieprzechwytująca): NMT, MC, CAR lub PT
 • $ - koniec ciągu.

Symbol zastępczy \1 (zapisany jako literał łańcuchowy "\\1") we wzorcu zamiany przywraca zawartość grupy 1 (część c).

2
Wiktor Stribiżew 11 czerwiec 2018, 08:55

Przekształciłbym te dane w ramkę danych, dzieląc każdy ciąg na kolumny. Z taką postacią znacznie łatwiej się pracuje.

library(stringr)

# Split every name on "_" into its parts (one character vector per name).
l <- str_split(df, pattern = "_")

# Pad each split result to exactly four entries (missing trailing parts become
# NA) and stack the results row-wise into a four-column data frame.
# NOTE: parts are assigned purely by position, so a short name such as "str_L"
# puts "L" in the second column rather than the third.
df2 <- as.data.frame(do.call(rbind, lapply(l, function(x) {
  length(x) <- 4
  x
})))

# Inspect the structure of the resulting data frame.
str(df2)

Prowadzi do:

'data.frame':  204 obs. of 4 variables:
$ V1: Factor w/ 3 levels "jtp","mdl","str": 3 3 3 3 3 3 3 3 3 3 ...
$ V2: Factor w/ 9 levels "ALL","H","HBO",..: 5 5 5 5 5 5 5 5 5 5 ...
$ V3: Factor w/ 3 levels "H","L","M": 2 3 1 2 3 1 2 3 1 2 ...
$ V4: Factor w/ 4 levels "CAR","MC","NMT",..: 3 3 3 2 2 2 1 1 1 4 ...
0
Wietze314 4 czerwiec 2018, 09:11

Nieco prostsze wyrażenia regularne

library(stringr)
library(dplyr)

# Build one alternation that matches "_<c>" at the end of the string or
# "_<c>_" in the middle, for every c value (L, M, H, ALL).
patt <- paste0(
  "_",
  rep(c("L", "M", "H", "ALL"), times = 2),
  rep(c("$", "_"), each = 4),
  collapse = "|"
)
patt
[1] "_L$|_M$|_H$|_ALL$|_L_|_M_|_H_|_ALL_"

# Take the first pattern match from each name (NA when nothing matches), then
# strip the surrounding underscores so only the c value remains.
df$income <- str_replace_all(str_extract(df$name, patt), "_", "")

head(df)
     name income
str_HBW_L_NMT   L
str_HBW_M_NMT   M
str_HBW_H_NMT   H
 str_HBW_L_MC   L
 str_HBW_M_MC   M
 str_HBW_H_MC   H

Albo całkowicie zrezygnuj z wyrażeń regularnych / dplyr / stringr:

# Base-R-only extraction of the c part: split every name on the literal "_"
# and keep the first piece that is one of the known c values, or NA when the
# name has none.
# vapply() (rather than lapply()) makes `income` a plain character vector
# instead of a list column, and no pipe is needed, so this really does run
# without dplyr/stringr/magrittr.
df$income <- vapply(
  strsplit(df$name, split = "_", fixed = TRUE),
  function(parts) {
    hits <- parts[parts %in% c("L", "M", "H", "ALL")]
    if (length(hits) == 0L) NA_character_ else hits[[1L]]
  },
  character(1)
)
0
sindri_baldur 4 czerwiec 2018, 11:41