Mam listę ciągów, które mają cztery elementy:

a_b_c_d gdzie:

  • a ma 3 wzory ciągów: str, jtp i mdl
  • b ma 5 wzorów ciągów: HBW, HBS, HBO, NHBB i NHBO
  • c ma 4 wzory łańcucha: L, M, H i ALL
  • d ma 4 wzory ciągów: NMT, MC, CAR i PT
  • a, c, c i d są podłączone według _ w porządku
  • Niektóre rekordy nie mają b, c i d

Muszę wyodrębnić c z list jako nowego pola income. Jeśli c nie istnieje, należy go zastąpić NA. Poniżej znajduje się rzeczywista ramka danych, której używam:

df <- c(
"str_HBW_L_NMT" ,"str_HBW_M_NMT" ,"str_HBW_H_NMT" ,"str_HBW_L_MC" ,"str_HBW_M_MC" ,"str_HBW_H_MC" ,
"str_HBW_L_CAR" ,"str_HBW_M_CAR" ,"str_HBW_H_CAR" ,"str_HBW_L_PT" ,"str_HBW_M_PT" ,"str_HBW_H_PT" ,
"str_HBS_L_NMT" ,"str_HBS_M_NMT" ,"str_HBS_H_NMT" ,"str_HBS_L_MC" ,"str_HBS_M_MC" ,"str_HBS_H_MC" ,
"str_HBS_L_CAR" ,"str_HBS_M_CAR" ,"str_HBS_H_CAR" ,"str_HBS_L_PT" ,"str_HBS_M_PT" ,"str_HBS_H_PT" ,
"str_HBO_L_NMT" ,"str_HBO_M_NMT" ,"str_HBO_H_NMT" ,"str_HBO_L_MC" ,"str_HBO_M_MC" ,"str_HBO_H_MC" ,
"str_HBO_L_CAR" ,"str_HBO_M_CAR" ,"str_HBO_H_CAR" ,"str_HBO_L_PT" ,"str_HBO_M_PT" ,"str_HBO_H_PT" ,
"str_NHBB_L_NMT","str_NHBB_M_NMT","str_NHBB_H_NMT","str_NHBB_L_MC","str_NHBB_M_MC","str_NHBB_H_MC",
"str_NHBB_L_CAR","str_NHBB_M_CAR","str_NHBB_H_CAR","str_NHBB_L_PT","str_NHBB_M_PT","str_NHBB_H_PT",
"str_NHBO_L_NMT","str_NHBO_M_NMT","str_NHBO_H_NMT","str_NHBO_L_MC","str_NHBO_M_MC","str_NHBO_H_MC",
"str_NHBO_L_CAR","str_NHBO_M_CAR","str_NHBO_H_CAR","str_NHBO_L_PT","str_NHBO_M_PT","str_NHBO_H_PT",
"str_HBW_L"     ,"str_HBW_M"     ,"str_HBW_H"     ,"str_HBS_L"    ,"str_HBS_M"    ,"str_HBS_H"    ,
"str_HBO_L"     ,"str_HBO_M"     ,"str_HBO_H"     ,"str_NHBB_L"   ,"str_NHBB_M"   ,"str_NHBB_H"   ,
"str_NHBO_L"    ,"str_NHBO_M"    ,"str_NHBO_H"    ,"str_HBW"      ,"str_HBS"      ,"str_HBO"      ,
"str_NHBB"      ,"str_NHBO"      ,"str_L"         ,"str_M"        ,"str_H"        ,"str_ALL"      ,
"jtp_HBW_L_NMT" ,"jtp_HBW_M_NMT" ,"jtp_HBW_H_NMT" ,"jtp_HBW_L_MC" ,"jtp_HBW_M_MC" ,"jtp_HBW_H_MC" ,
"jtp_HBW_L_CAR" ,"jtp_HBW_M_CAR" ,"jtp_HBW_H_CAR" ,"jtp_HBW_L_PT" ,"jtp_HBW_M_PT" ,"jtp_HBW_H_PT" ,
"jtp_HBS_L_NMT" ,"jtp_HBS_M_NMT" ,"jtp_HBS_H_NMT" ,"jtp_HBS_L_MC" ,"jtp_HBS_M_MC" ,"jtp_HBS_H_MC" ,
"jtp_HBS_L_CAR" ,"jtp_HBS_M_CAR" ,"jtp_HBS_H_CAR" ,"jtp_HBS_L_PT" ,"jtp_HBS_M_PT" ,"jtp_HBS_H_PT" ,
"jtp_HBW_L"     ,"jtp_HBW_M"     ,"jtp_HBW_H"     ,"jtp_HBS_L"    ,"jtp_HBS_M"    ,"jtp_HBS_H"    ,
"jtp_HBW"       ,"jtp_HBS"       ,"jtp_L"         ,"jtp_M"        ,"jtp_H"        ,"jtp_ALL"      ,
"mdl_HBW_L_NMT" ,"mdl_HBW_M_NMT" ,"mdl_HBW_H_NMT" ,"mdl_HBW_L_MC" ,"mdl_HBW_M_MC" ,"mdl_HBW_H_MC" ,
"mdl_HBW_L_CAR" ,"mdl_HBW_M_CAR" ,"mdl_HBW_H_CAR" ,"mdl_HBW_L_PT" ,"mdl_HBW_M_PT" ,"mdl_HBW_H_PT" ,
"mdl_HBS_L_NMT" ,"mdl_HBS_M_NMT" ,"mdl_HBS_H_NMT" ,"mdl_HBS_L_MC" ,"mdl_HBS_M_MC" ,"mdl_HBS_H_MC" ,
"mdl_HBS_L_CAR" ,"mdl_HBS_M_CAR" ,"mdl_HBS_H_CAR" ,"mdl_HBS_L_PT" ,"mdl_HBS_M_PT" ,"mdl_HBS_H_PT" ,
"mdl_HBO_L_NMT" ,"mdl_HBO_M_NMT" ,"mdl_HBO_H_NMT" ,"mdl_HBO_L_MC" ,"mdl_HBO_M_MC" ,"mdl_HBO_H_MC" ,
"mdl_HBO_L_CAR" ,"mdl_HBO_M_CAR" ,"mdl_HBO_H_CAR" ,"mdl_HBO_L_PT" ,"mdl_HBO_M_PT" ,"mdl_HBO_H_PT" ,
"mdl_NHBB_L_NMT","mdl_NHBB_M_NMT","mdl_NHBB_H_NMT","mdl_NHBB_L_MC","mdl_NHBB_M_MC","mdl_NHBB_H_MC",
"mdl_NHBB_L_CAR","mdl_NHBB_M_CAR","mdl_NHBB_H_CAR","mdl_NHBB_L_PT","mdl_NHBB_M_PT","mdl_NHBB_H_PT",
"mdl_NHBO_L_NMT","mdl_NHBO_M_NMT","mdl_NHBO_H_NMT","mdl_NHBO_L_MC","mdl_NHBO_M_MC","mdl_NHBO_H_MC",
"mdl_NHBO_L_CAR","mdl_NHBO_M_CAR","mdl_NHBO_H_CAR","mdl_NHBO_L_PT","mdl_NHBO_M_PT","mdl_NHBO_H_PT",
"mdl_HBW_L"     ,"mdl_HBW_M"     ,"mdl_HBW_H"     ,"mdl_HBS_L"    ,"mdl_HBS_M"    ,"mdl_HBS_H"    ,
"mdl_HBO_L"     ,"mdl_HBO_M"     ,"mdl_HBO_H"     ,"mdl_NHBB_L"   ,"mdl_NHBB_M"   ,"mdl_NHBB_H"   ,
"mdl_NHBO_L"    ,"mdl_NHBO_M"    ,"mdl_NHBO_H"    ,"mdl_HBW"      ,"mdl_HBS"      ,"mdl_HBO"      ,
"mdl_NHBB"      ,"mdl_NHBO"      ,"mdl_L"         ,"mdl_M"        ,"mdl_H"        ,"mdl_ALL"
)

Zrobiłem wiele prób, ale nie mógł go prawidłowo wydobyć. Poniżej znajduje się przykład mojego skryptu:

df %>% mutate(income=str_extract_all(string=name,
        pattern="(?!str|jtp|mdl|HBW|HBS|HBO|NHBB|NHBO|_)[L|M|H|(ALL)](?!NMT|MC|CAR|PT|_)"))

Czy masz jakieś sugestię, aby wyodrębnić żądane wyjście jako pokaż poniżej? Wolę używać tidyverse i stringr, ale base funkcja ma również zastosowanie zamiast stringr.

      name          income
1    str_HBW_L_NMT  L
2    str_HBW_M_NMT  M
3    str_HBW_H_NMT  H
4     str_HBW_L_MC  L
5     str_HBW_M_MC  M
6     str_HBW_H_MC  H
7    str_HBW_L_CAR  L
8    str_HBW_M_CAR  M
9    str_HBW_H_CAR  H
10    str_HBW_L_PT  L
11    str_HBW_M_PT  M
12    str_HBW_H_PT  H
13   str_HBS_L_NMT  L
14   str_HBS_M_NMT  M
15   str_HBS_H_NMT  H
16    str_HBS_L_MC  L
17    str_HBS_M_MC  M
18    str_HBS_H_MC  H
19   str_HBS_L_CAR  L
20   str_HBS_M_CAR  M
21   str_HBS_H_CAR  H
22    str_HBS_L_PT  L
23    str_HBS_M_PT  M
24    str_HBS_H_PT  H
25   str_HBO_L_NMT  L
26   str_HBO_M_NMT  M
27   str_HBO_H_NMT  H
28    str_HBO_L_MC  L
29    str_HBO_M_MC  M
30    str_HBO_H_MC  H
31   str_HBO_L_CAR  L
32   str_HBO_M_CAR  M
33   str_HBO_H_CAR  H
34    str_HBO_L_PT  L
35    str_HBO_M_PT  M
36    str_HBO_H_PT  H
37  str_NHBB_L_NMT  L
38  str_NHBB_M_NMT  M
39  str_NHBB_H_NMT  H
40   str_NHBB_L_MC  L
41   str_NHBB_M_MC  M
42   str_NHBB_H_MC  H
43  str_NHBB_L_CAR  L
44  str_NHBB_M_CAR  M
45  str_NHBB_H_CAR  H
46   str_NHBB_L_PT  L
47   str_NHBB_M_PT  M
48   str_NHBB_H_PT  H
49  str_NHBO_L_NMT  L
50  str_NHBO_M_NMT  M
51  str_NHBO_H_NMT  H
52   str_NHBO_L_MC  L
53   str_NHBO_M_MC  M
54   str_NHBO_H_MC  H
55  str_NHBO_L_CAR  L
56  str_NHBO_M_CAR  M
57  str_NHBO_H_CAR  H
58   str_NHBO_L_PT  L
59   str_NHBO_M_PT  M
60   str_NHBO_H_PT  H
61       str_HBW_L  L
62       str_HBW_M  M
63       str_HBW_H  H
64       str_HBS_L  L
65       str_HBS_M  M
66       str_HBS_H  H
67       str_HBO_L  L
68       str_HBO_M  M
69       str_HBO_H  H
70      str_NHBB_L  L
71      str_NHBB_M  M
72      str_NHBB_H  H
73      str_NHBO_L  L
74      str_NHBO_M  M
75      str_NHBO_H  H
76         str_HBW  <N/A>
77         str_HBS  <N/A>
78         str_HBO  <N/A>
79        str_NHBB  <N/A>
80        str_NHBO  <N/A>
81           str_L  L
82           str_M  M
83           str_H  H
84         str_ALL  ALL
85   jtp_HBW_L_NMT  L
86   jtp_HBW_M_NMT  M
87   jtp_HBW_H_NMT  H
88    jtp_HBW_L_MC  L
89    jtp_HBW_M_MC  M
90    jtp_HBW_H_MC  H
91   jtp_HBW_L_CAR  L
92   jtp_HBW_M_CAR  M
93   jtp_HBW_H_CAR  H
94    jtp_HBW_L_PT  L
95    jtp_HBW_M_PT  M
96    jtp_HBW_H_PT  H
97   jtp_HBS_L_NMT  L
98   jtp_HBS_M_NMT  M
99   jtp_HBS_H_NMT  H
100   jtp_HBS_L_MC  L
101   jtp_HBS_M_MC  M
102   jtp_HBS_H_MC  H
103  jtp_HBS_L_CAR  L
104  jtp_HBS_M_CAR  M
105  jtp_HBS_H_CAR  H
106   jtp_HBS_L_PT  L
107   jtp_HBS_M_PT  M
108   jtp_HBS_H_PT  H
109      jtp_HBW_L  L
110      jtp_HBW_M  M
111      jtp_HBW_H  H
112      jtp_HBS_L  L
113      jtp_HBS_M  M
114      jtp_HBS_H  H
115        jtp_HBW  <N/A>
116        jtp_HBS  <N/A>
117          jtp_L  L
118          jtp_M  M
119          jtp_H  H
120        jtp_ALL  ALL
121  mdl_HBW_L_NMT  L
122  mdl_HBW_M_NMT  M
123  mdl_HBW_H_NMT  H
124   mdl_HBW_L_MC  L
125   mdl_HBW_M_MC  M
126   mdl_HBW_H_MC  H
127  mdl_HBW_L_CAR  L
128  mdl_HBW_M_CAR  M
129  mdl_HBW_H_CAR  H
130   mdl_HBW_L_PT  L
131   mdl_HBW_M_PT  M
132   mdl_HBW_H_PT  H
133  mdl_HBS_L_NMT  L
134  mdl_HBS_M_NMT  M
135  mdl_HBS_H_NMT  H
136   mdl_HBS_L_MC  L
137   mdl_HBS_M_MC  M
138   mdl_HBS_H_MC  H
139  mdl_HBS_L_CAR  L
140  mdl_HBS_M_CAR  M
141  mdl_HBS_H_CAR  H
142   mdl_HBS_L_PT  L
143   mdl_HBS_M_PT  M
144   mdl_HBS_H_PT  H
145  mdl_HBO_L_NMT  L
146  mdl_HBO_M_NMT  M
147  mdl_HBO_H_NMT  H
148   mdl_HBO_L_MC  L
149   mdl_HBO_M_MC  M
150   mdl_HBO_H_MC  H
151  mdl_HBO_L_CAR  L
152  mdl_HBO_M_CAR  M
153  mdl_HBO_H_CAR  H
154   mdl_HBO_L_PT  L
155   mdl_HBO_M_PT  M
156   mdl_HBO_H_PT  H
157 mdl_NHBB_L_NMT  L
158 mdl_NHBB_M_NMT  M
159 mdl_NHBB_H_NMT  H
160  mdl_NHBB_L_MC  L
161  mdl_NHBB_M_MC  M
162  mdl_NHBB_H_MC  H
163 mdl_NHBB_L_CAR  L
164 mdl_NHBB_M_CAR  M
165 mdl_NHBB_H_CAR  H
166  mdl_NHBB_L_PT  L
167  mdl_NHBB_M_PT  M
168  mdl_NHBB_H_PT  H
169 mdl_NHBO_L_NMT  L
170 mdl_NHBO_M_NMT  M
171 mdl_NHBO_H_NMT  H
172  mdl_NHBO_L_MC  L
173  mdl_NHBO_M_MC  M
174  mdl_NHBO_H_MC  H
175 mdl_NHBO_L_CAR  L
176 mdl_NHBO_M_CAR  M
177 mdl_NHBO_H_CAR  H
178  mdl_NHBO_L_PT  L
179  mdl_NHBO_M_PT  M
180  mdl_NHBO_H_PT  H
181      mdl_HBW_L  L
182      mdl_HBW_M  M
183      mdl_HBW_H  H
184      mdl_HBS_L  L
185      mdl_HBS_M  M
186      mdl_HBS_H  H
187      mdl_HBO_L  L
188      mdl_HBO_M  M
189      mdl_HBO_H  H
190     mdl_NHBB_L  L
191     mdl_NHBB_M  M
192     mdl_NHBB_H  H
193     mdl_NHBO_L  L
194     mdl_NHBO_M  M
195     mdl_NHBO_H  H
196        mdl_HBW  <N/A>
197        mdl_HBS  <N/A>
198        mdl_HBO  <N/A>
199       mdl_NHBB  <N/A>
200       mdl_NHBO  <N/A>
201          mdl_L  L
202          mdl_M  M
203          mdl_H  H
204        mdl_ALL  ALL

============== Nowy przykład ramy danych ========== Rekordy zawierające tylko c lub d zostały dodane na górze oryginalnego DF.

df <- c(
"NMT","MC","CAR","PT","L","M","H","ALL",
"str_HBW_L_NMT" ,"str_HBW_M_NMT" ,"str_HBW_H_NMT" ,"str_HBW_L_MC" ,"str_HBW_M_MC" ,"str_HBW_H_MC" ,
"str_HBW_L_CAR" ,"str_HBW_M_CAR" ,"str_HBW_H_CAR" ,"str_HBW_L_PT" ,"str_HBW_M_PT" ,"str_HBW_H_PT" ,
"str_HBS_L_NMT" ,"str_HBS_M_NMT" ,"str_HBS_H_NMT" ,"str_HBS_L_MC" ,"str_HBS_M_MC" ,"str_HBS_H_MC" ,
"str_HBS_L_CAR" ,"str_HBS_M_CAR" ,"str_HBS_H_CAR" ,"str_HBS_L_PT" ,"str_HBS_M_PT" ,"str_HBS_H_PT" ,
"str_HBO_L_NMT" ,"str_HBO_M_NMT" ,"str_HBO_H_NMT" ,"str_HBO_L_MC" ,"str_HBO_M_MC" ,"str_HBO_H_MC" ,
"str_HBO_L_CAR" ,"str_HBO_M_CAR" ,"str_HBO_H_CAR" ,"str_HBO_L_PT" ,"str_HBO_M_PT" ,"str_HBO_H_PT" ,
"str_NHBB_L_NMT","str_NHBB_M_NMT","str_NHBB_H_NMT","str_NHBB_L_MC","str_NHBB_M_MC","str_NHBB_H_MC",
"str_NHBB_L_CAR","str_NHBB_M_CAR","str_NHBB_H_CAR","str_NHBB_L_PT","str_NHBB_M_PT","str_NHBB_H_PT",
"str_NHBO_L_NMT","str_NHBO_M_NMT","str_NHBO_H_NMT","str_NHBO_L_MC","str_NHBO_M_MC","str_NHBO_H_MC",
"str_NHBO_L_CAR","str_NHBO_M_CAR","str_NHBO_H_CAR","str_NHBO_L_PT","str_NHBO_M_PT","str_NHBO_H_PT",
"str_HBW_L"     ,"str_HBW_M"     ,"str_HBW_H"     ,"str_HBS_L"    ,"str_HBS_M"    ,"str_HBS_H"    ,
"str_HBO_L"     ,"str_HBO_M"     ,"str_HBO_H"     ,"str_NHBB_L"   ,"str_NHBB_M"   ,"str_NHBB_H"   ,
"str_NHBO_L"    ,"str_NHBO_M"    ,"str_NHBO_H"    ,"str_HBW"      ,"str_HBS"      ,"str_HBO"      ,
"str_NHBB"      ,"str_NHBO"      ,"str_L"         ,"str_M"        ,"str_H"        ,"str_ALL"      ,
"jtp_HBW_L_NMT" ,"jtp_HBW_M_NMT" ,"jtp_HBW_H_NMT" ,"jtp_HBW_L_MC" ,"jtp_HBW_M_MC" ,"jtp_HBW_H_MC" ,
"jtp_HBW_L_CAR" ,"jtp_HBW_M_CAR" ,"jtp_HBW_H_CAR" ,"jtp_HBW_L_PT" ,"jtp_HBW_M_PT" ,"jtp_HBW_H_PT" ,
"jtp_HBS_L_NMT" ,"jtp_HBS_M_NMT" ,"jtp_HBS_H_NMT" ,"jtp_HBS_L_MC" ,"jtp_HBS_M_MC" ,"jtp_HBS_H_MC" ,
"jtp_HBS_L_CAR" ,"jtp_HBS_M_CAR" ,"jtp_HBS_H_CAR" ,"jtp_HBS_L_PT" ,"jtp_HBS_M_PT" ,"jtp_HBS_H_PT" ,
"jtp_HBW_L"     ,"jtp_HBW_M"     ,"jtp_HBW_H"     ,"jtp_HBS_L"    ,"jtp_HBS_M"    ,"jtp_HBS_H"    ,
"jtp_HBW"       ,"jtp_HBS"       ,"jtp_L"         ,"jtp_M"        ,"jtp_H"        ,"jtp_ALL"      ,
"mdl_HBW_L_NMT" ,"mdl_HBW_M_NMT" ,"mdl_HBW_H_NMT" ,"mdl_HBW_L_MC" ,"mdl_HBW_M_MC" ,"mdl_HBW_H_MC" ,
"mdl_HBW_L_CAR" ,"mdl_HBW_M_CAR" ,"mdl_HBW_H_CAR" ,"mdl_HBW_L_PT" ,"mdl_HBW_M_PT" ,"mdl_HBW_H_PT" ,
"mdl_HBS_L_NMT" ,"mdl_HBS_M_NMT" ,"mdl_HBS_H_NMT" ,"mdl_HBS_L_MC" ,"mdl_HBS_M_MC" ,"mdl_HBS_H_MC" ,
"mdl_HBS_L_CAR" ,"mdl_HBS_M_CAR" ,"mdl_HBS_H_CAR" ,"mdl_HBS_L_PT" ,"mdl_HBS_M_PT" ,"mdl_HBS_H_PT" ,
"mdl_HBO_L_NMT" ,"mdl_HBO_M_NMT" ,"mdl_HBO_H_NMT" ,"mdl_HBO_L_MC" ,"mdl_HBO_M_MC" ,"mdl_HBO_H_MC" ,
"mdl_HBO_L_CAR" ,"mdl_HBO_M_CAR" ,"mdl_HBO_H_CAR" ,"mdl_HBO_L_PT" ,"mdl_HBO_M_PT" ,"mdl_HBO_H_PT" ,
"mdl_NHBB_L_NMT","mdl_NHBB_M_NMT","mdl_NHBB_H_NMT","mdl_NHBB_L_MC","mdl_NHBB_M_MC","mdl_NHBB_H_MC",
"mdl_NHBB_L_CAR","mdl_NHBB_M_CAR","mdl_NHBB_H_CAR","mdl_NHBB_L_PT","mdl_NHBB_M_PT","mdl_NHBB_H_PT",
"mdl_NHBO_L_NMT","mdl_NHBO_M_NMT","mdl_NHBO_H_NMT","mdl_NHBO_L_MC","mdl_NHBO_M_MC","mdl_NHBO_H_MC",
"mdl_NHBO_L_CAR","mdl_NHBO_M_CAR","mdl_NHBO_H_CAR","mdl_NHBO_L_PT","mdl_NHBO_M_PT","mdl_NHBO_H_PT",
"mdl_HBW_L"     ,"mdl_HBW_M"     ,"mdl_HBW_H"     ,"mdl_HBS_L"    ,"mdl_HBS_M"    ,"mdl_HBS_H"    ,
"mdl_HBO_L"     ,"mdl_HBO_M"     ,"mdl_HBO_H"     ,"mdl_NHBB_L"   ,"mdl_NHBB_M"   ,"mdl_NHBB_H"   ,
"mdl_NHBO_L"    ,"mdl_NHBO_M"    ,"mdl_NHBO_H"    ,"mdl_HBW"      ,"mdl_HBS"      ,"mdl_HBO"      ,
"mdl_NHBB"      ,"mdl_NHBO"      ,"mdl_L"         ,"mdl_M"        ,"mdl_H"        ,"mdl_ALL"
1
Hideo.S 4 czerwiec 2018, 11:58

3 odpowiedzi

Najlepsza odpowiedź

Możesz utworzyć regex, który będzie pasował do wszystkich swoich wejść za pomocą opcjonalnych grupami dla b, c i d i d, usuń wszystkie oprócz części c z ciągu, a następnie zastąp pusty przedmioty z NA s:

res <- sub("^(?:(?:str|jtp|mdl)(?:_|$))?(?:(?:HB[WSO]|NHB[BO])(?:_|$))?(?:([LMH]|ALL)(?:_|$))?(?:NMT|MC|CAR|PT)?$", "\\1", df)
res[nchar(res)==0] <- NA
> res
  [1] NA    NA    NA    NA    "L"   "M"   "H"   "ALL" "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"  
 [29] "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"  
 [57] "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   NA   
 [85] NA    NA    NA    NA    "L"   "M"   "H"   "ALL" "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"  
[113] "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   NA    NA    "L"   "M"   "H"   "ALL" "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"  
[141] "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"  
[169] "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"   "H"   "L"   "M"  
[197] "H"   "L"   "M"   "H"   "L"   "M"   "H"   NA    NA    NA    NA    NA    "L"   "M"   "H"   "ALL"

Zobacz REGEX DEMO.

Jak widać, nie potrzebujesz tutaj żadnego dodatkowego pakietu.

Szczegóły wzoru

  • ^ - początek ciągu
  • (?:(?:str|jtp|mdl)(?:_|$))? - a część (opcjonalna grupa): str, jtp lub mdl a następnie mdl lub koniec ciągów ({{x6 }})
  • (?:(?:HB[WSO]|NHB[BO])(?:_|$))? - część b (a opcjonalna grupa): Opcjonalna niepopłatna grupowa grupowa _, a następnie HBW, HBS, {x5}} , NHBB lub NHBO a następnie _ lub koniec łańcucha ((?:_|$))
  • (?:_([LMH]|ALL))? - Część c Część: Opcjonalna niepokój pasuje do grupy _, a następnie L, M, H lub {{x6} lub ALL } Następnie za pomocą _ lub koniec ciągu ((?:_|$))
  • (?:NMT|MC|CAR|PT)? - A d Część: Opcjonalna niezapruchająca się pasuje do grupy NMT, MC, CAR lub PT
  • $ - koniec ciągu.

\\1 (String {X1}}) Zastępca w wzorze zamiennym przywraca grupę 1 (c wzór).

2
Wiktor Stribiżew 11 czerwiec 2018, 08:55

Dokonałbym te dane w dataframe, dzieląc łańcuch do kolumn. To znacznie łatwiej pracować.

require(stringr)

l <- str_split(df, pattern = "_") 

df2 <- as.data.frame(do.call(rbind, lapply(l, function(x) {length(x) <- 4
x})))

STR (DF2)

Prowadzi do:

'data.frame':   204 obs. of  4 variables:
$ V1: Factor w/ 3 levels "jtp","mdl","str": 3 3 3 3 3 3 3 3 3 3 ...
$ V2: Factor w/ 9 levels "ALL","H","HBO",..: 5 5 5 5 5 5 5 5 5 5 ...
$ V3: Factor w/ 3 levels "H","L","M": 2 3 1 2 3 1 2 3 1 2 ...
$ V4: Factor w/ 4 levels "CAR","MC","NMT",..: 3 3 3 2 2 2 1 1 1 4 ...
0
Wietze314 4 czerwiec 2018, 09:11

Nieco prostsze wyrażenia regularne

library(stringr)
library(dplyr)

# Create pattern to look for
patt <- paste0("_", c("L", "M", "H", "ALL"), c(rep("$", 4), rep("_", 4)))
patt <- paste(patt, collapse = "|")
patt
[1] "_L$|_M$|_H$|_ALL$|_L_|_M_|_H_|_ALL_"

# Extract pattern and clean
df$income <- str_extract(df$name, patt) %>% 
  str_replace_all("_", "")

head(df)
         name income
str_HBW_L_NMT      L
str_HBW_M_NMT      M
str_HBW_H_NMT      H
 str_HBW_L_MC      L
 str_HBW_M_MC      M
 str_HBW_H_MC      H

Lub Unikaj REGEX / DPLYR / Stringr całkowicie :

df$income <- 
  strsplit(df$name, split = "_", fixed = TRUE) %>% 
  lapply(
    function(x) {
      our_el <- x %in% c("L", "M", "H", "ALL")
      ifelse(!any(our_el), NA, x[our_el])
    }
  )
0
sindri_baldur 4 czerwiec 2018, 11:41