`Private` Class

Private members of a regular expression. More...

Declaration

class reg::Ex::Private { ... }

Public Constructors Index

	Private (std::string_view pat)
	Creates the private part. More...

Public Member Functions Index

void	compile ()
	Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching. More...

bool	matchAt (size_t tokenPos, size_t tokenLen, std::string_view str, Match &match, size_t pos, int level) const
	Internal matching routine. More...

Public Member Attributes Index

bool	error = false
	Flag indicating that an error occurred while compiling the expression. More...

std::vector< PToken >	data
	The token stream representing the compiled regular expression. More...

std::string	pattern
	The pattern string as passed by the user. More...

Description

Private members of a regular expression.

Definition at line 169 of file regex.cpp.

Public Constructors

Private()

reg::Ex::Private::Private (std::string_view pat)

inline

Creates the private part.

Definition at line 173 of file regex.cpp.

173 Private(std::string_view pat) : pattern(pat)

174 {

175 data.reserve(100);

176 }

References data and pattern.

Public Member Functions

compile()

void reg::Ex::Private::compile ()

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

Definition at line 177 of file regex.cpp.

197 void Ex::Private::compile()

198 {

199 error = false;

200 data.clear();

201 if (pattern.empty()) return;

202 const char *start = pattern.c_str();

203 const char *ps = start;

204 char c = 0;

205

206 int prevTokenPos=-1;

207 int tokenPos=0;

208

209 auto addToken = [&](PToken tok)

210 {

211 tokenPos++;

212 data.emplace_back(tok);

213 };

214

215 auto getNextCharacter = [&]() -> PToken

216 {

217 char cs=*ps;

218 PToken result = PToken(cs);

219 if (cs=='\\') // escaped character

220 {

221 ps++;

222 cs=*ps;

223 switch (cs)

224 {

225 case 'n': result = PToken('\n'); break;

226 case 'r': result = PToken('\r'); break;

227 case 't': result = PToken('\t'); break;

228 case 's': result = PToken(PToken::Kind::WhiteSpace); break;

229 case 'a': result = PToken(PToken::Kind::Alpha); break;

230 case 'w': result = PToken(PToken::Kind::AlphaNum); break;

231 case 'd': result = PToken(PToken::Kind::Digit); break;

232 case '<': result = PToken(PToken::Kind::BeginOfWord); break;

233 case '>': result = PToken(PToken::Kind::EndOfWord); break;

234 case 'x':

235 case 'X':

236 {

237 uint16_t v=0;

238 for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits

239 {

240 int d = (cs>='a' && cs<='f') ? cs-'a'+10 :

241 (cs>='A' && cs<='F') ? cs-'A'+10 :

242 (cs>='0' && cs<='9') ? cs-'0' :

243 -1;

244 if (d>=0) { v<<=4; v|=d; ps++; } else break;

245 }

246 result = PToken(v);

247 }

248 break;

249 case '\0': ps--; break; // backslash at the end of the pattern

250 default:

251 result = PToken(cs);

252 break;

253 }

254 }

255 return result;

256 };

257

258 while ((c=*ps))

259 {

260 switch (c)

261 {

262 case '^': // beginning of line (if first character of the pattern)

263 prevTokenPos = tokenPos;

264 addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :

265 PToken(c));

266 break;

267 case '$': // end of the line (if last character of the pattern)

268 prevTokenPos = tokenPos;

269 addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :

270 PToken(c));

271 break;

272 case '.': // any character

273 prevTokenPos = tokenPos;

274 addToken(PToken(PToken::Kind::Any));

275 break;

276 case '(': // begin of capture group

277 prevTokenPos = tokenPos;

278 addToken(PToken(PToken::Kind::BeginCapture));

279 break;

280 case ')': // end of capture group

281 prevTokenPos = tokenPos;

282 addToken(PToken(PToken::Kind::EndCapture));

283 break;

284 case '[': // character class

285 {

286 prevTokenPos = tokenPos;

287 ps++;

288 if (*ps==0) { error=true; return; }

289 bool esc = *ps=='\\';

290 PToken tok = getNextCharacter();

291 ps++;

292 if (!esc && tok.kind()==PToken::Kind::Character &&

293 tok.asciiValue()=='^') // negated character class

294 {

295 addToken(PToken(PToken::Kind::NegCharClass));

296 if (*ps==0) { error=true; return; }

297 tok = getNextCharacter();

298 ps++;

299 }

300 else

301 {

302 addToken(PToken(PToken::Kind::CharClass));

303 }

304 uint16_t numTokens=0;

305 while ((c=*ps))

306 {

307 if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range

308 {

309 getNextCharacter();

310 ps++;

311 PToken endTok = getNextCharacter();

312 ps++;

313 if (tok.value()>endTok.value())

314 {

315 addToken(PToken(endTok.value(),tok.value())); // swap start and end

316 }

317 else

318 {

319 addToken(PToken(tok.value(),endTok.value()));

320 }

321 numTokens++;

322 }

323 else // single char, from==to

324 {

325 if (tok.kind()==PToken::Kind::Character)

326 {

327 addToken(PToken(tok.value(),tok.value()));

328 }

329 else // special token, add as-is since from>to

330 {

331 addToken(tok);

332 }

333 numTokens++;

334 }

335 if (*ps==0) { error=true; return; } // expected at least a ]

336 esc = *ps=='\\';

337 tok = getNextCharacter();

338 if (!esc && tok.kind()==PToken::Kind::Character &&

339 tok.value()==static_cast<uint16_t>(']'))

340 {

341 break; // end of character class

342 }

343 if (*ps==0) { error=true; return; } // no ] found

344 ps++;

345 }

346 // set the value of either NegCharClass or CharClass

347 data[prevTokenPos].setValue(numTokens);

348 }

349 break;

350 case '*': // 0 or more

351 case '+': // 1 or more

352 case '?': // optional: 0 or 1

353 {

354 if (prevTokenPos==-1)

355 {

356 error=true;

357 return;

358 }

359 switch (data[prevTokenPos].kind())

360 {

361 case PToken::Kind::BeginOfLine: // $* or $+ or $?

362 case PToken::Kind::BeginOfWord: // \<* or \<+ or \<?

363 case PToken::Kind::EndOfWord: // \>* or \>+ or \>?

364 case PToken::Kind::Star: // ** or *+ or *?

365 case PToken::Kind::Optional: // ?* or ?+ or ??

366 error=true;

367 return;

368 default: // ok

369 break;

370 }

371 int ddiff = static_cast<int>(tokenPos-prevTokenPos);

372 if (*ps=='+') // convert <pat>+ -> <pat><pat>*

373 {

374 // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]

375 // ddiff=n ^prevTokenPos

376 data.resize(data.size()+ddiff);

377 std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);

378 prevTokenPos+=ddiff;

379 tokenPos+=ddiff;

380 }

381 if (data[prevTokenPos].kind()==PToken::Kind::EndCapture)

382 {

383 // find the beginning of the capture range

384 while (prevTokenPos>0 && data[prevTokenPos].kind()!=PToken::Kind::BeginCapture)

385 {

386 prevTokenPos--;

387 }

388 }

389 data.insert(data.begin()+prevTokenPos,

390 c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));

391 tokenPos++;

392 addToken(PToken(PToken::Kind::End));

393 // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]

394 // ^prevTokenPos

395 // same for 'T?'.

396 }

397 break;

398 default:

399 prevTokenPos = tokenPos;

400 addToken(getNextCharacter());

401 break;

402 }

403 ps++;

404 }

405 //addToken(PToken(PToken::Kind::End));

406 }

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue, reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, reg::PToken::CharClass, data, reg::PToken::Digit, reg::PToken::End, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, error, reg::PToken::kind, reg::PToken::NegCharClass, reg::PToken::Optional, pattern, reg::PToken::Star, reg::PToken::value and reg::PToken::WhiteSpace.

matchAt()

bool reg::Ex::Private::matchAt (size_t tokenPos, size_t tokenLen, std::string_view str, Match & match, size_t pos, int level) const

Internal matching routine.

Parameters

tokenPos	Offset into the token stream.
tokenLen	The length of the token stream.
str	The input string to match against.
match	The object used to store the matching results.
pos	The position in the input string at which to start matching.
level	Recursion level (used for debugging)

Definition at line 181 of file regex.cpp.

448 bool Ex::Private::matchAt(size_t tokenPos,size_t tokenLen,std::string_view str,Match &match,const size_t pos,int level) const

449 {

450 DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,pos<str.length() ? str.substr(pos).c_str() : "",pos);

451 auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };

452 auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };

453 auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool

454 {

455 PToken tok = data[tp];

456 bool negate = tok.kind()==PToken::Kind::NegCharClass;

457 uint16_t numFields = tok.value();

458 bool found = false;

459 for (uint16_t i=0;i<numFields;i++)

460 {

461 tok = data[++tp];

462 // first check for built-in ranges

463 if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||

464 (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||

465 (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||

466 (tok.kind()==PToken::Kind::Digit && isdigit(c))

467 )

468 {

469 found=true;

470 break;

471 }

472 else // user specified range

473 {

474 uint16_t v = static_cast<uint16_t>(c);

475 if (tok.from()<=v && v<=tok.to())

476 {

477 found=true;

478 break;

479 }

480 }

481 }

482 DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);

483 return negate ? !found : found;

484 };

485 size_t index = pos;

486 enum SequenceType { Star, Optional, OptionalRange };

487 auto processSequence = [this,&tokenPos,&tokenLen,&index,&str,&matchCharClass,

488 &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool

489 {

490 size_t startIndex = index;

491 size_t len = str.length();

492 PToken tok = data[++tokenPos];

493 if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's

494 {

495 char c_tok = tok.asciiValue();

496 while (index<len && str[index]==c_tok) { index++; if (type==Optional) break; }

497 tokenPos++;

498 }

499 else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters

500 {

501 while (index<len && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }

502 tokenPos+=tok.value()+1; // skip over character ranges + end token

503 }

504 else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters

505 {

506 while (index<len && isStartIdChar(str[index])) { index++; if (type==Optional) break; }

507 tokenPos++;

508 }

509 else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters

510 {

511 while (index<len && isIdChar(str[index])) { index++; if (type==Optional) break; }

512 tokenPos++;

513 }

514 else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces

515 {

516 while (index<len && isspace(str[index])) { index++; if (type==Optional) break; }

517 tokenPos++;

518 }

519 else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits

520 {

521 while (index<len && isdigit(str[index])) { index++; if (type==Optional) break; }

522 tokenPos++;

523 }

524 else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all

525 {

526 if (type==Optional) index++; else index = str.length();

527 tokenPos++;

528 }

529 else if (type==OptionalRange && tok.kind()==PToken::Kind::BeginCapture)

530 {

531 size_t tokenStart = ++tokenPos;

532 while (tokenPos<tokenLen && data[tokenPos].kind()!=PToken::Kind::EndCapture) { tokenPos++; }

533 Match rangeMatch;

534 rangeMatch.init(str);

535 bool found = matchAt(tokenStart,tokenPos,str,rangeMatch,index,level+1);

536 if (found)

537 {

538 index+=rangeMatch.length(); // (abc)? matches -> eat all

539 }

540 tokenPos++; // skip over EndCapture

541 }

542 tokenPos++; // skip over end marker

543 while (index>=startIndex)

544 {

545 // pattern 'x*xy' should match 'xy' and 'xxxxy'

546 bool found = matchAt(tokenPos,tokenLen,str,match,index,level+1);

547 if (found)

548 {

549 match.setMatch(pos,index-pos+match.length());

550 return true;

551 }

552 if (index==0) break;

553 index--;

554 }

555 return false;

556 };

557

558 while (tokenPos<tokenLen)

559 {

560 PToken tok = data[tokenPos];

561 DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());

562 if (tok.kind()==PToken::Kind::Character) // match literal character

563 {

564 char c_tok = tok.asciiValue();

565 if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char

566 index++,tokenPos++;

567 }

568 else if (tok.isCharClass())

569 {

570 if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;

571 index++,tokenPos+=tok.value()+1; // skip over character ranges + end token

572 }

573 else

574 {

575 switch (tok.kind())

576 {

577 case PToken::Kind::Alpha:

578 if (index>=str.length() || !isStartIdChar(str[index])) return false;

579 index++;

580 break;

581 case PToken::Kind::AlphaNum:

582 if (index>=str.length() || !isIdChar(str[index])) return false;

583 index++;

584 break;

585 case PToken::Kind::WhiteSpace:

586 if (index>=str.length() || !isspace(str[index])) return false;

587 index++;

588 break;

589 case PToken::Kind::Digit:

590 if (index>=str.length() || !isdigit(str[index])) return false;

591 index++;

592 break;

593 case PToken::Kind::BeginOfLine:

594 if (index!=pos) return false;

595 break;

596 case PToken::Kind::EndOfLine:

597 if (index<str.length()) return false;

598 break;

599 case PToken::Kind::BeginOfWord:

600 DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",

601 index,str[index],isIdChar(str[index]),

602 index>0?str[index]-1:0,

603 index>0?isIdChar(str[index-1]):-1);

604 if (index>=str.length() ||

605 !isIdChar(str[index]) ||

606 (index>0 && isIdChar(str[index-1]))) return false;

607 break;

608 case PToken::Kind::EndOfWord:

609 DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",

610 index,pos,str[index],isIdChar(str[index]),

611 index==0 ? 0 : str[index-1],

612 index==0 ? -1 : isIdChar(str[index-1]));

613 if (index<str.length() &&

614 (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;

615 break;

616 case PToken::Kind::BeginCapture:

617 DBG("BeginCapture(%zu)\n",index);

618 match.startCapture(index);

619 break;

620 case PToken::Kind::EndCapture:

621 DBG("EndCapture(%zu)\n",index);

622 match.endCapture(index);

623 break;

624 case PToken::Kind::Any:

625 if (index>=str.length()) return false;

626 index++;

627 break;

628 case PToken::Kind::Star:

629 return processSequence(Star);

630 case PToken::Kind::Optional:

631 if (tokenPos<tokenLen-1 && data[tokenPos+1].kind()==PToken::Kind::BeginCapture)

632 {

633 return processSequence(OptionalRange); // (...)?

634 }

635 else

636 {

637 return processSequence(Optional); // x?

638 }

639 default:

640 return false;

641 }

642 tokenPos++;

643 }

644 }

645 match.setMatch(pos,index-pos);

646 return true;

647 }

References reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue, reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, data, DBG, reg::PToken::Digit, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, reg::PToken::from, reg::Match::init, reg::isalnum, reg::isalpha, reg::PToken::isCharClass, reg::isdigit, isIdChar, reg::isspace, reg::PToken::kind, reg::PToken::kindStr, reg::Match::length, reg::Ex::match, matchAt, reg::PToken::NegCharClass, reg::PToken::Optional, reg::PToken::Star, reg::PToken::to, reg::PToken::value and reg::PToken::WhiteSpace.

Referenced by matchAt.

Public Member Attributes

data

std::vector<PToken> reg::Ex::Private::data

The token stream representing the compiled regular expression.

Definition at line 188 of file regex.cpp.

188 std::vector<PToken> data; // compiled pattern

Referenced by compile, matchAt and Private.

error

bool reg::Ex::Private::error = false

Flag indicating that an error occurred while compiling the expression.

Definition at line 185 of file regex.cpp.

185 bool error = false;

Referenced by compile.

pattern

std::string reg::Ex::Private::pattern

The pattern string as passed by the user.

Definition at line 191 of file regex.cpp.

191 std::string pattern;

Referenced by compile and Private.

The documentation for this class was generated from the following file:

regex.cpp

Generated via doxygen2docusaurus 2.0.0 by Doxygen 1.14.0.

Declaration​

Public Constructors Index​

Public Member Functions Index​

Public Member Attributes Index​

Description​

Public Constructors​

Private()​

Public Member Functions​

compile()​

matchAt()​

Public Member Attributes​

data​

error​

pattern​