libstdc++
regex.tcc
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // See below __regex_algo_impl to get what this is talking about. The default
32 // value 1 indicated a conservative optimization without giving up worst case
33 // performance.
34 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
35 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1
36 #endif
37 
38 namespace std _GLIBCXX_VISIBILITY(default)
39 {
40 namespace __detail
41 {
42 _GLIBCXX_BEGIN_NAMESPACE_VERSION
43 
44  // Result of merging regex_match and regex_search.
45  //
46  // __policy now can be _S_auto (auto dispatch) and _S_alternate (use
47  // the other one if possible, for test purpose).
48  //
49  // That __match_mode is true means regex_match, else regex_search.
50  template<typename _BiIter, typename _Alloc,
51  typename _CharT, typename _TraitsT,
52  _RegexExecutorPolicy __policy,
53  bool __match_mode>
54  bool
55  __regex_algo_impl(_BiIter __s,
56  _BiIter __e,
57  match_results<_BiIter, _Alloc>& __m,
58  const basic_regex<_CharT, _TraitsT>& __re,
59  regex_constants::match_flag_type __flags)
60  {
61  if (__re._M_automaton == nullptr)
62  return false;
63 
64  typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m;
65  __res.resize(__re._M_automaton->_M_sub_count() + 2);
66  for (auto& __it : __res)
67  __it.matched = false;
68 
69  // This function decide which executor to use under given circumstances.
70  // The _S_auto policy now is the following: if a NFA has no
71  // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT
72  // quantifiers (*, +, ?), the BFS executor will be used, other wise
73  // DFS executor. This is because DFS executor has a exponential upper
74  // bound, but better best-case performace. Meanwhile, BFS executor can
75  // effectively prevent from exponential-long time matching (which must
76  // contains many quantifiers), but it's slower in average.
77  //
78  // For simple regex, BFS executor could be 2 or more times slower than
79  // DFS executor.
80  //
81  // Of course, BFS executor cannot handle back-references.
82  bool __ret;
83  if (!__re._M_automaton->_M_has_backref
84  && (__policy == _RegexExecutorPolicy::_S_alternate
85  || __re._M_automaton->_M_quant_count
86  > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT))
87  {
88  _Executor<_BiIter, _Alloc, _TraitsT, false>
89  __executor(__s, __e, __m, __re, __flags);
90  if (__match_mode)
91  __ret = __executor._M_match();
92  else
93  __ret = __executor._M_search();
94  }
95  else
96  {
97  _Executor<_BiIter, _Alloc, _TraitsT, true>
98  __executor(__s, __e, __m, __re, __flags);
99  if (__match_mode)
100  __ret = __executor._M_match();
101  else
102  __ret = __executor._M_search();
103  }
104  if (__ret)
105  {
106  for (auto __it : __res)
107  if (!__it.matched)
108  __it.first = __it.second = __e;
109  auto& __pre = __res[__res.size()-2];
110  auto& __suf = __res[__res.size()-1];
111  if (__match_mode)
112  {
113  __pre.matched = false;
114  __pre.first = __s;
115  __pre.second = __s;
116  __suf.matched = false;
117  __suf.first = __e;
118  __suf.second = __e;
119  }
120  else
121  {
122  __pre.first = __s;
123  __pre.second = __res[0].first;
124  __pre.matched = (__pre.first != __pre.second);
125  __suf.first = __res[0].second;
126  __suf.second = __e;
127  __suf.matched = (__suf.first != __suf.second);
128  }
129  }
130  return __ret;
131  }
132 
133 _GLIBCXX_END_NAMESPACE_VERSION
134 }
135 
136 _GLIBCXX_BEGIN_NAMESPACE_VERSION
137 
138  template<typename _Ch_type>
139  template<typename _Fwd_iter>
140  typename regex_traits<_Ch_type>::string_type
141  regex_traits<_Ch_type>::
142  lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
143  {
144  typedef std::ctype<char_type> __ctype_type;
145  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
146 
147  static const char* __collatenames[] =
148  {
149  "NUL",
150  "SOH",
151  "STX",
152  "ETX",
153  "EOT",
154  "ENQ",
155  "ACK",
156  "alert",
157  "backspace",
158  "tab",
159  "newline",
160  "vertical-tab",
161  "form-feed",
162  "carriage-return",
163  "SO",
164  "SI",
165  "DLE",
166  "DC1",
167  "DC2",
168  "DC3",
169  "DC4",
170  "NAK",
171  "SYN",
172  "ETB",
173  "CAN",
174  "EM",
175  "SUB",
176  "ESC",
177  "IS4",
178  "IS3",
179  "IS2",
180  "IS1",
181  "space",
182  "exclamation-mark",
183  "quotation-mark",
184  "number-sign",
185  "dollar-sign",
186  "percent-sign",
187  "ampersand",
188  "apostrophe",
189  "left-parenthesis",
190  "right-parenthesis",
191  "asterisk",
192  "plus-sign",
193  "comma",
194  "hyphen",
195  "period",
196  "slash",
197  "zero",
198  "one",
199  "two",
200  "three",
201  "four",
202  "five",
203  "six",
204  "seven",
205  "eight",
206  "nine",
207  "colon",
208  "semicolon",
209  "less-than-sign",
210  "equals-sign",
211  "greater-than-sign",
212  "question-mark",
213  "commercial-at",
214  "A",
215  "B",
216  "C",
217  "D",
218  "E",
219  "F",
220  "G",
221  "H",
222  "I",
223  "J",
224  "K",
225  "L",
226  "M",
227  "N",
228  "O",
229  "P",
230  "Q",
231  "R",
232  "S",
233  "T",
234  "U",
235  "V",
236  "W",
237  "X",
238  "Y",
239  "Z",
240  "left-square-bracket",
241  "backslash",
242  "right-square-bracket",
243  "circumflex",
244  "underscore",
245  "grave-accent",
246  "a",
247  "b",
248  "c",
249  "d",
250  "e",
251  "f",
252  "g",
253  "h",
254  "i",
255  "j",
256  "k",
257  "l",
258  "m",
259  "n",
260  "o",
261  "p",
262  "q",
263  "r",
264  "s",
265  "t",
266  "u",
267  "v",
268  "w",
269  "x",
270  "y",
271  "z",
272  "left-curly-bracket",
273  "vertical-line",
274  "right-curly-bracket",
275  "tilde",
276  "DEL",
277  ""
278  };
279 
280  // same as boost
281  //static const char* __digraphs[] =
282  // {
283  // "ae",
284  // "Ae",
285  // "AE",
286  // "ch",
287  // "Ch",
288  // "CH",
289  // "ll",
290  // "Ll",
291  // "LL",
292  // "ss",
293  // "Ss",
294  // "SS",
295  // "nj",
296  // "Nj",
297  // "NJ",
298  // "dz",
299  // "Dz",
300  // "DZ",
301  // "lj",
302  // "Lj",
303  // "LJ",
304  // ""
305  // };
306 
307  std::string __s(__last - __first, '?');
308  __fctyp.narrow(__first, __last, '?', &*__s.begin());
309 
310  for (unsigned int __i = 0; *__collatenames[__i]; __i++)
311  if (__s == __collatenames[__i])
312  return string_type(1, __fctyp.widen(static_cast<char>(__i)));
313 
314  //for (unsigned int __i = 0; *__digraphs[__i]; __i++)
315  // {
316  // const char* __now = __digraphs[__i];
317  // if (__s == __now)
318  // {
319  // string_type ret(__s.size(), __fctyp.widen('?'));
320  // __fctyp.widen(__now, __now + 2/* ouch */, &*ret.begin());
321  // return ret;
322  // }
323  // }
324  return string_type();
325  }
326 
327  template<typename _Ch_type>
328  template<typename _Fwd_iter>
329  typename regex_traits<_Ch_type>::char_class_type
330  regex_traits<_Ch_type>::
331  lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
332  {
333  typedef std::ctype<char_type> __ctype_type;
334  typedef std::ctype<char> __cctype_type;
335  typedef const pair<const char*, char_class_type> _ClassnameEntry;
336  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
337  const __cctype_type& __cctyp(use_facet<__cctype_type>(_M_locale));
338 
339  static _ClassnameEntry __classnames[] =
340  {
341  {"d", ctype_base::digit},
342  {"w", {ctype_base::alnum, _RegexMask::_S_under}},
343  {"s", ctype_base::space},
344  {"alnum", ctype_base::alnum},
345  {"alpha", ctype_base::alpha},
346  {"blank", {0, _RegexMask::_S_blank}},
347  {"cntrl", ctype_base::cntrl},
348  {"digit", ctype_base::digit},
349  {"graph", ctype_base::graph},
350  {"lower", ctype_base::lower},
351  {"print", ctype_base::print},
352  {"punct", ctype_base::punct},
353  {"space", ctype_base::space},
354  {"upper", ctype_base::upper},
355  {"xdigit", ctype_base::xdigit},
356  };
357 
358  std::string __s(__last - __first, '?');
359  __fctyp.narrow(__first, __last, '?', &__s[0]);
360  __cctyp.tolower(&*__s.begin(), &*__s.begin() + __s.size());
361  for (_ClassnameEntry* __it = __classnames;
362  __it < *(&__classnames + 1);
363  ++__it)
364  {
365  if (__s == __it->first)
366  {
367  if (__icase
368  && ((__it->second
369  & (ctype_base::lower | ctype_base::upper)) != 0))
370  return ctype_base::alpha;
371  return __it->second;
372  }
373  }
374  return 0;
375  }
376 
377  template<typename _Ch_type>
378  bool
379  regex_traits<_Ch_type>::
380  isctype(_Ch_type __c, char_class_type __f) const
381  {
382  typedef std::ctype<char_type> __ctype_type;
383  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
384 
385  return __fctyp.is(__f._M_base, __c)
386  // [[:w:]]
387  || ((__f._M_extended & _RegexMask::_S_under)
388  && __c == __fctyp.widen('_'))
389  // [[:blank:]]
390  || ((__f._M_extended & _RegexMask::_S_blank)
391  && (__c == __fctyp.widen(' ')
392  || __c == __fctyp.widen('\t')));
393  }
394 
395  template<typename _Ch_type>
396  int
397  regex_traits<_Ch_type>::
398  value(_Ch_type __ch, int __radix) const
399  {
400  std::basic_istringstream<char_type> __is(string_type(1, __ch));
401  long __v;
402  if (__radix == 8)
403  __is >> std::oct;
404  else if (__radix == 16)
405  __is >> std::hex;
406  __is >> __v;
407  return __is.fail() ? -1 : __v;
408  }
409 
410  template<typename _Bi_iter, typename _Alloc>
411  template<typename _Out_iter>
412  _Out_iter match_results<_Bi_iter, _Alloc>::
413  format(_Out_iter __out,
414  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first,
415  const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last,
416  match_flag_type __flags) const
417  {
418  _GLIBCXX_DEBUG_ASSERT( ready() );
419  regex_traits<char_type> __traits;
420  typedef std::ctype<char_type> __ctype_type;
421  const __ctype_type&
422  __fctyp(use_facet<__ctype_type>(__traits.getloc()));
423 
424  auto __output = [&](size_t __idx)
425  {
426  auto& __sub = _Base_type::operator[](__idx);
427  if (__sub.matched)
428  __out = std::copy(__sub.first, __sub.second, __out);
429  };
430 
431  if (__flags & regex_constants::format_sed)
432  {
433  for (; __fmt_first != __fmt_last;)
434  if (*__fmt_first == '&')
435  {
436  __output(0);
437  ++__fmt_first;
438  }
439  else if (*__fmt_first == '\\')
440  {
441  if (++__fmt_first != __fmt_last
442  && __fctyp.is(__ctype_type::digit, *__fmt_first))
443  __output(__traits.value(*__fmt_first++, 10));
444  else
445  *__out++ = '\\';
446  }
447  else
448  *__out++ = *__fmt_first++;
449  }
450  else
451  {
452  while (1)
453  {
454  auto __next = std::find(__fmt_first, __fmt_last, '$');
455  if (__next == __fmt_last)
456  break;
457 
458  __out = std::copy(__fmt_first, __next, __out);
459 
460  auto __eat = [&](char __ch) -> bool
461  {
462  if (*__next == __ch)
463  {
464  ++__next;
465  return true;
466  }
467  return false;
468  };
469 
470  if (++__next == __fmt_last)
471  *__out++ = '$';
472  else if (__eat('$'))
473  *__out++ = '$';
474  else if (__eat('&'))
475  __output(0);
476  else if (__eat('`'))
477  __output(_Base_type::size()-2);
478  else if (__eat('\''))
479  __output(_Base_type::size()-1);
480  else if (__fctyp.is(__ctype_type::digit, *__next))
481  {
482  long __num = __traits.value(*__next, 10);
483  if (++__next != __fmt_last
484  && __fctyp.is(__ctype_type::digit, *__next))
485  {
486  __num *= 10;
487  __num += __traits.value(*__next++, 10);
488  }
489  if (0 <= __num && __num < this->size())
490  __output(__num);
491  }
492  else
493  *__out++ = '$';
494  __fmt_first = __next;
495  }
496  __out = std::copy(__fmt_first, __fmt_last, __out);
497  }
498  return __out;
499  }
500 
501  template<typename _Out_iter, typename _Bi_iter,
502  typename _Rx_traits, typename _Ch_type>
503  _Out_iter
504  regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last,
505  const basic_regex<_Ch_type, _Rx_traits>& __e,
506  const _Ch_type* __fmt,
507  regex_constants::match_flag_type __flags)
508  {
509  typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT;
510  _IterT __i(__first, __last, __e, __flags);
511  _IterT __end;
512  if (__i == __end)
513  {
514  if (!(__flags & regex_constants::format_no_copy))
515  __out = std::copy(__first, __last, __out);
516  }
517  else
518  {
519  sub_match<_Bi_iter> __last;
520  auto __len = char_traits<_Ch_type>::length(__fmt);
521  for (; __i != __end; ++__i)
522  {
523  if (!(__flags & regex_constants::format_no_copy))
524  __out = std::copy(__i->prefix().first, __i->prefix().second,
525  __out);
526  __out = __i->format(__out, __fmt, __fmt + __len, __flags);
527  __last = __i->suffix();
528  if (__flags & regex_constants::format_first_only)
529  break;
530  }
531  if (!(__flags & regex_constants::format_no_copy))
532  __out = std::copy(__last.first, __last.second, __out);
533  }
534  return __out;
535  }
536 
537  template<typename _Bi_iter,
538  typename _Ch_type,
539  typename _Rx_traits>
540  bool
541  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
542  operator==(const regex_iterator& __rhs) const
543  {
544  return (_M_match.empty() && __rhs._M_match.empty())
545  || (_M_begin == __rhs._M_begin
546  && _M_end == __rhs._M_end
547  && _M_pregex == __rhs._M_pregex
548  && _M_flags == __rhs._M_flags
549  && _M_match[0] == __rhs._M_match[0]);
550  }
551 
552  template<typename _Bi_iter,
553  typename _Ch_type,
554  typename _Rx_traits>
555  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
556  regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
557  operator++()
558  {
559  // In all cases in which the call to regex_search returns true,
560  // match.prefix().first shall be equal to the previous value of
561  // match[0].second, and for each index i in the half-open range
562  // [0, match.size()) for which match[i].matched is true,
563  // match[i].position() shall return distance(begin, match[i].first).
564  // [28.12.1.4.5]
565  if (_M_match[0].matched)
566  {
567  auto __start = _M_match[0].second;
568  auto __prefix_first = _M_match[0].second;
569  if (_M_match[0].first == _M_match[0].second)
570  {
571  if (__start == _M_end)
572  {
573  _M_match = value_type();
574  return *this;
575  }
576  else
577  {
578  if (regex_search(__start, _M_end, _M_match, *_M_pregex,
579  _M_flags
580  | regex_constants::match_not_null
581  | regex_constants::match_continuous))
582  {
583  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
584  _M_match.at(_M_match.size()).first = __prefix_first;
585  _M_match._M_in_iterator = true;
586  _M_match._M_begin = _M_begin;
587  return *this;
588  }
589  else
590  ++__start;
591  }
592  }
593  _M_flags |= regex_constants::match_prev_avail;
594  if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags))
595  {
596  _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched);
597  _M_match.at(_M_match.size()).first = __prefix_first;
598  _M_match._M_in_iterator = true;
599  _M_match._M_begin = _M_begin;
600  }
601  else
602  _M_match = value_type();
603  }
604  return *this;
605  }
606 
607  template<typename _Bi_iter,
608  typename _Ch_type,
609  typename _Rx_traits>
610  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
611  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
612  operator=(const regex_token_iterator& __rhs)
613  {
614  _M_position = __rhs._M_position;
615  _M_subs = __rhs._M_subs;
616  _M_n = __rhs._M_n;
617  _M_result = __rhs._M_result;
618  _M_suffix = __rhs._M_suffix;
619  _M_has_m1 = __rhs._M_has_m1;
620  if (__rhs._M_result == &__rhs._M_suffix)
621  _M_result = &_M_suffix;
622  return *this;
623  }
624 
625  template<typename _Bi_iter,
626  typename _Ch_type,
627  typename _Rx_traits>
628  bool
629  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
630  operator==(const regex_token_iterator& __rhs) const
631  {
632  if (_M_end_of_seq() && __rhs._M_end_of_seq())
633  return true;
634  if (_M_suffix.matched && __rhs._M_suffix.matched
635  && _M_suffix == __rhs._M_suffix)
636  return true;
637  if (_M_end_of_seq() || _M_suffix.matched
638  || __rhs._M_end_of_seq() || __rhs._M_suffix.matched)
639  return false;
640  return _M_position == __rhs._M_position
641  && _M_n == __rhs._M_n
642  && _M_subs == __rhs._M_subs;
643  }
644 
645  template<typename _Bi_iter,
646  typename _Ch_type,
647  typename _Rx_traits>
648  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>&
649  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
650  operator++()
651  {
652  _Position __prev = _M_position;
653  if (_M_suffix.matched)
654  *this = regex_token_iterator();
655  else if (_M_n + 1 < _M_subs.size())
656  {
657  _M_n++;
658  _M_result = &_M_current_match();
659  }
660  else
661  {
662  _M_n = 0;
663  ++_M_position;
664  if (_M_position != _Position())
665  _M_result = &_M_current_match();
666  else if (_M_has_m1 && __prev->suffix().length() != 0)
667  {
668  _M_suffix.matched = true;
669  _M_suffix.first = __prev->suffix().first;
670  _M_suffix.second = __prev->suffix().second;
671  _M_result = &_M_suffix;
672  }
673  else
674  *this = regex_token_iterator();
675  }
676  return *this;
677  }
678 
679  template<typename _Bi_iter,
680  typename _Ch_type,
681  typename _Rx_traits>
682  void
683  regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>::
684  _M_init(_Bi_iter __a, _Bi_iter __b)
685  {
686  _M_has_m1 = false;
687  for (auto __it : _M_subs)
688  if (__it == -1)
689  {
690  _M_has_m1 = true;
691  break;
692  }
693  if (_M_position != _Position())
694  _M_result = &_M_current_match();
695  else if (_M_has_m1)
696  {
697  _M_suffix.matched = true;
698  _M_suffix.first = __a;
699  _M_suffix.second = __b;
700  _M_result = &_M_suffix;
701  }
702  else
703  _M_result = nullptr;
704  }
705 
706 _GLIBCXX_END_NAMESPACE_VERSION
707 } // namespace
708