Skip to main content

The Private Class Reference

Private members of a regular expression. More...

Declaration

class reg::Ex::Private { ... }

Public Constructors Index

Private (std::string_view pat)

Creates the private part. More...

Public Member Functions Index

void compile ()

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching. More...

bool matchAt (size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const

Internal matching routine. More...

Public Member Attributes Index

bool error = false

Flag indicating that the expression could not be compiled; compile() resets it to false and sets it to true when it detects an invalid pattern. More...

std::vector< PToken > data

The token stream representing the compiled regular expression. More...

std::string pattern

The pattern string as passed by the user. More...

Description

Private members of a regular expression.

Definition at line 169 of file regex.cpp.

Public Constructors

Private()

reg::Ex::Private::Private (std::string_view pat)
inline

Creates the private part.

Definition at line 173 of file regex.cpp.

173 Private(std::string_view pat) : pattern(pat)
174 {
175 data.reserve(100);
176 }

References data and pattern.

Public Member Functions

compile()

void reg::Ex::Private::compile ()

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

Definition at line 177 of file regex.cpp.

198{
199 error = false;
200 data.clear();
201 if (pattern.empty()) return;
202 const char *start = pattern.c_str();
203 const char *ps = start;
204 char c = 0;
205
206 int prevTokenPos=-1;
207 int tokenPos=0;
208
209 auto addToken = [&](PToken tok)
210 {
211 tokenPos++;
212 data.emplace_back(tok);
213 };
214
215 auto getNextCharacter = [&]() -> PToken
216 {
217 char cs=*ps;
218 PToken result = PToken(cs);
219 if (cs=='\\') // escaped character
220 {
221 ps++;
222 cs=*ps;
223 switch (cs)
224 {
225 case 'n': result = PToken('\n'); break;
226 case 'r': result = PToken('\r'); break;
227 case 't': result = PToken('\t'); break;
228 case 's': result = PToken(PToken::Kind::WhiteSpace); break;
229 case 'a': result = PToken(PToken::Kind::Alpha); break;
230 case 'w': result = PToken(PToken::Kind::AlphaNum); break;
231 case 'd': result = PToken(PToken::Kind::Digit); break;
232 case '<': result = PToken(PToken::Kind::BeginOfWord); break;
233 case '>': result = PToken(PToken::Kind::EndOfWord); break;
234 case 'x':
235 case 'X':
236 {
237 uint16_t v=0;
238 for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
239 {
240 int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
241 (cs>='A' && cs<='F') ? cs-'A'+10 :
242 (cs>='0' && cs<='9') ? cs-'0' :
243 -1;
244 if (d>=0) { v<<=4; v|=d; ps++; } else break;
245 }
246 result = PToken(v);
247 }
248 break;
249 case '\0': ps--; break; // backslash at the end of the pattern
250 default:
251 result = PToken(cs);
252 break;
253 }
254 }
255 return result;
256 };
257
258 while ((c=*ps))
259 {
260 switch (c)
261 {
262 case '^': // beginning of line (if first character of the pattern)
263 prevTokenPos = tokenPos;
264 addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
265 PToken(c));
266 break;
267 case '$': // end of the line (if last character of the pattern)
268 prevTokenPos = tokenPos;
269 addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
270 PToken(c));
271 break;
272 case '.': // any character
273 prevTokenPos = tokenPos;
274 addToken(PToken(PToken::Kind::Any));
275 break;
276 case '(': // begin of capture group
277 prevTokenPos = tokenPos;
278 addToken(PToken(PToken::Kind::BeginCapture));
279 break;
280 case ')': // end of capture group
281 prevTokenPos = tokenPos;
282 addToken(PToken(PToken::Kind::EndCapture));
283 break;
284 case '[': // character class
285 {
286 prevTokenPos = tokenPos;
287 ps++;
288 if (*ps==0) { error=true; return; }
289 bool esc = *ps=='\\';
290 PToken tok = getNextCharacter();
291 ps++;
292 if (!esc && tok.kind()==PToken::Kind::Character &&
293 tok.asciiValue()=='^') // negated character class
294 {
295 addToken(PToken(PToken::Kind::NegCharClass));
296 if (*ps==0) { error=true; return; }
297 tok = getNextCharacter();
298 ps++;
299 }
300 else
301 {
302 addToken(PToken(PToken::Kind::CharClass));
303 }
304 uint16_t numTokens=0;
305 while ((c=*ps))
306 {
307 if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
308 {
309 getNextCharacter();
310 ps++;
311 PToken endTok = getNextCharacter();
312 ps++;
313 if (tok.value()>endTok.value())
314 {
315 addToken(PToken(endTok.value(),tok.value())); // swap start and end
316 }
317 else
318 {
319 addToken(PToken(tok.value(),endTok.value()));
320 }
321 numTokens++;
322 }
323 else // single char, from==to
324 {
325 if (tok.kind()==PToken::Kind::Character)
326 {
327 addToken(PToken(tok.value(),tok.value()));
328 }
329 else // special token, add as-is since from>to
330 {
331 addToken(tok);
332 }
333 numTokens++;
334 }
335 if (*ps==0) { error=true; return; } // expected at least a ]
336 esc = *ps=='\\';
337 tok = getNextCharacter();
338 if (!esc && tok.kind()==PToken::Kind::Character &&
339 tok.value()==static_cast<uint16_t>(']'))
340 {
341 break; // end of character class
342 }
343 if (*ps==0) { error=true; return; } // no ] found
344 ps++;
345 }
346 // set the value of either NegCharClass or CharClass
347 data[prevTokenPos].setValue(numTokens);
348 }
349 break;
350 case '*': // 0 or more
351 case '+': // 1 or more
352 case '?': // optional: 0 or 1
353 {
354 if (prevTokenPos==-1)
355 {
356 error=true;
357 return;
358 }
359 switch (data[prevTokenPos].kind())
360 {
361 case PToken::Kind::BeginOfLine: // ^* or ^+ or ^?
362 case PToken::Kind::BeginOfWord: // \<* or \<+ or \<?
363 case PToken::Kind::EndOfWord: // \>* or \>+ or \>?
364 case PToken::Kind::Star: // ** or *+ or *?
365 case PToken::Kind::Optional: // ?* or ?+ or ??
366 error=true;
367 return;
368 default: // ok
369 break;
370 }
371 int ddiff = static_cast<int>(tokenPos-prevTokenPos);
372 if (*ps=='+') // convert <pat>+ -> <pat><pat>*
373 {
374 // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
375 // ddiff=n ^prevTokenPos
376 data.resize(data.size()+ddiff);
377 std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
378 prevTokenPos+=ddiff;
379 tokenPos+=ddiff;
380 }
381 if (data[prevTokenPos].kind()==PToken::Kind::EndCapture)
382 {
383 // find the beginning of the capture range
384 while (prevTokenPos>0 && data[prevTokenPos].kind()!=PToken::Kind::BeginCapture)
385 {
386 prevTokenPos--;
387 }
388 }
389 data.insert(data.begin()+prevTokenPos,
390 c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));
391 tokenPos++;
392 addToken(PToken(PToken::Kind::End));
393 // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
394 // ^prevTokenPos
395 // same for 'T?'.
396 }
397 break;
398 default:
399 prevTokenPos = tokenPos;
400 addToken(getNextCharacter());
401 break;
402 }
403 ps++;
404 }
405 //addToken(PToken(PToken::Kind::End));
406}

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue, reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, reg::PToken::CharClass, data, reg::PToken::Digit, reg::PToken::End, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, error, reg::PToken::kind, reg::PToken::NegCharClass, reg::PToken::Optional, pattern, reg::PToken::Star, reg::PToken::value and reg::PToken::WhiteSpace.

matchAt()

bool reg::Ex::Private::matchAt (size_t tokenPos, size_t tokenLen, std::string_view str, Match & match, size_t pos, int level)

Internal matching routine.

Parameters
tokenPos

Offset into the token stream.

tokenLen

The length of the token stream.

str

The input string to match against.

match

The object used to store the matching results.

pos

The position in the input string at which to start matching.

level

Recursion level (used for debugging)

Definition at line 181 of file regex.cpp.

448bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,Match &match,const size_t pos,int level) const
449{
450 DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() : "",pos);
451 auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
452 auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };
453 auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
454 {
455 PToken tok = data[tp];
456 bool negate = tok.kind()==PToken::Kind::NegCharClass;
457 uint16_t numFields = tok.value();
458 bool found = false;
459 for (uint16_t i=0;i<numFields;i++)
460 {
461 tok = data[++tp];
462 // first check for built-in ranges
463 if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||
464 (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||
465 (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||
466 (tok.kind()==PToken::Kind::Digit && isdigit(c))
467 )
468 {
469 found=true;
470 break;
471 }
472 else // user specified range
473 {
474 uint16_t v = static_cast<uint16_t>(c);
475 if (tok.from()<=v && v<=tok.to())
476 {
477 found=true;
478 break;
479 }
480 }
481 }
482 DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
483 return negate ? !found : found;
484 };
485 size_t index = pos;
486 enum SequenceType { Star, Optional, OptionalRange };
487 auto processSequence = [this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,
488 &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
489 {
490 size_t startIndex = index;
491 size_t len = str.length();
492 PToken tok = data[++tokenPos];
493 if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
494 {
495 char c_tok = tok.asciiValue();
496 while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }
497 tokenPos++;
498 }
499 else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
500 {
501 while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
502 tokenPos+=tok.value()+1; // skip over character ranges + end token
503 }
504 else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
505 {
506 while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
507 tokenPos++;
508 }
509 else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
510 {
511 while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }
512 tokenPos++;
513 }
514 else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
515 {
516 while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }
517 tokenPos++;
518 }
519 else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
520 {
521 while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }
522 tokenPos++;
523 }
524 else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
525 {
526 if (type==Optional) index++; else index = str.length();
527 tokenPos++;
528 }
529 else if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)
530 {
531 size_t tokenStart = ++tokenPos;
532 while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }
533 Match rangeMatch;
534 rangeMatch.init(str);
535 bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);
536 if (found)
537 {
538 index+=rangeMatch.length(); // (abc)? matches -> eat all
539 }
540 tokenPos++; // skip over EndCapture
541 }
542 tokenPos++; // skip over end marker
543 while (index>=startIndex)
544 {
545 // pattern 'x*xy' should match 'xy' and 'xxxxy'
546 bool found = matchAt(tokenPos,tokenLen,str,match,index,level+1);
547 if (found)
548 {
549 match.setMatch(pos,index-pos+match.length());
550 return true;
551 }
552 if (index==0) break;
553 index--;
554 }
555 return false;
556 };
557
558 while (tokenPos<tokenLen)
559 {
560 PToken tok = data[tokenPos];
561 DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
562 if (tok.kind()==PToken::Kind::Character) // match literal character
563 {
564 char c_tok = tok.asciiValue();
565 if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
566 index++,tokenPos++;
567 }
568 else if (tok.isCharClass())
569 {
570 if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
571 index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
572 }
573 else
574 {
575 switch (tok.kind())
576 {
577 case PToken::Kind::Alpha:
578 if (index>=str.length() || !isStartIdChar(str[index])) return false;
579 index++;
580 break;
581 case PToken::Kind::AlphaNum:
582 if (index>=str.length() || !isIdChar(str[index])) return false;
583 index++;
584 break;
585 case PToken::Kind::WhiteSpace:
586 if (index>=str.length() || !isspace(str[index])) return false;
587 index++;
588 break;
589 case PToken::Kind::Digit:
590 if (index>=str.length() || !isdigit(str[index])) return false;
591 index++;
592 break;
593 case PToken::Kind::BeginOfLine:
594 if (index!=pos) return false;
595 break;
596 case PToken::Kind::EndOfLine:
597 if (index<str.length()) return false;
598 break;
599 case PToken::Kind::BeginOfWord:
600 DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
601 index,str[index],isIdChar(str[index]),
602 index>0?str[index]-1:0,
603 index>0?isIdChar(str[index-1]):-1);
604 if (index>=str.length() ||
605 !isIdChar(str[index]) ||
606 (index>0 && isIdChar(str[index-1]))) return false;
607 break;
608 case PToken::Kind::EndOfWord:
609 DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
610 index,pos,str[index],isIdChar(str[index]),
611 index==0 ? 0 : str[index-1],
612 index==0 ? -1 : isIdChar(str[index-1]));
613 if (index<str.length() &&
614 (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
615 break;
616 case PToken::Kind::BeginCapture:
617 DBG("BeginCapture(%zu)\n",index);
618 match.startCapture(index);
619 break;
620 case PToken::Kind::EndCapture:
621 DBG("EndCapture(%zu)\n",index);
622 match.endCapture(index);
623 break;
624 case PToken::Kind::Any:
625 if (index>=str.length()) return false;
626 index++;
627 break;
628 case PToken::Kind::Star:
629 return processSequence(Star);
630 case PToken::Kind::Optional:
631 if (tokenPos<tokenLen-1 && data[tokenPos+1].kind()==PToken::Kind::BeginCapture)
632 {
633 return processSequence(OptionalRange); // (...)?
634 }
635 else
636 {
637 return processSequence(Optional); // x?
638 }
639 default:
640 return false;
641 }
639 default:
640 return false;
641 }
642 tokenPos++;
643 }
644 }
645 match.setMatch(pos,index-pos);
646 return true;
647}

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue, reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, data, DBG, reg::PToken::Digit, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, reg::PToken::from, reg::Match::init, reg::isalnum, reg::isalpha, reg::PToken::isCharClass, reg::isdigit, isIdChar, reg::isspace, reg::PToken::kind, reg::PToken::kindStr, reg::Match::length, reg::Ex::match, matchAt, reg::PToken::NegCharClass, reg::PToken::Optional, reg::PToken::Star, reg::PToken::to, reg::PToken::value and reg::PToken::WhiteSpace.

Referenced by matchAt.

Public Member Attributes

data

std::vector<PToken> reg::Ex::Private::data

The token stream representing the compiled regular expression.

Definition at line 188 of file regex.cpp.

188 std::vector<PToken> data; // compiled pattern

Referenced by compile, matchAt and Private.

error

bool reg::Ex::Private::error = false

Flag indicating that the expression could not be compiled; compile() resets it to false and sets it to true when it detects an invalid pattern.

Definition at line 185 of file regex.cpp.

185 bool error = false;

Referenced by compile.

pattern

std::string reg::Ex::Private::pattern

The pattern string as passed by the user.

Definition at line 191 of file regex.cpp.

191 std::string pattern;

Referenced by compile and Private.


The documentation for this class was generated from the following file:

- regex.cpp


Generated via doxygen2docusaurus by Doxygen 1.14.0.