//===-- Regex.cpp - Regular Expression matcher implementation -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements a POSIX regular expression matcher. // //===----------------------------------------------------------------------===// #include "llvm/Support/Regex.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "regex_impl.h" #include #include using namespace llvm; Regex::Regex() : preg(nullptr), error(REG_BADPAT) {} Regex::Regex(StringRef regex, RegexFlags Flags) { unsigned flags = 0; preg = new llvm_regex(); preg->re_endp = regex.end(); if (Flags & IgnoreCase) flags |= REG_ICASE; if (Flags & Newline) flags |= REG_NEWLINE; if (!(Flags & BasicRegex)) flags |= REG_EXTENDED; error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); } Regex::Regex(StringRef regex, unsigned Flags) : Regex(regex, static_cast(Flags)) {} Regex::Regex(Regex &®ex) { preg = regex.preg; error = regex.error; regex.preg = nullptr; regex.error = REG_BADPAT; } Regex::~Regex() { if (preg) { llvm_regfree(preg); delete preg; } } namespace { /// Utility to convert a regex error code into a human-readable string. void RegexErrorToString(int error, struct llvm_regex *preg, std::string &Error) { size_t len = llvm_regerror(error, preg, nullptr, 0); Error.resize(len - 1); llvm_regerror(error, preg, &Error[0], len); } } // namespace bool Regex::isValid(std::string &Error) const { if (!error) return true; RegexErrorToString(error, preg, Error); return false; } /// getNumMatches - In a valid regex, return the number of parenthesized /// matches it contains. unsigned Regex::getNumMatches() const { return preg->re_nsub; } bool Regex::match(StringRef String, SmallVectorImpl *Matches, std::string *Error) const { // Reset error, if given. if (Error && !Error->empty()) *Error = ""; // Check if the regex itself didn't successfully compile. if (Error ? !isValid(*Error) : !isValid()) return false; unsigned nmatch = Matches ? preg->re_nsub+1 : 0; // Update null string to empty string. if (String.data() == nullptr) String = ""; // pmatch needs to have at least one element. SmallVector pm; pm.resize(nmatch > 0 ? nmatch : 1); pm[0].rm_so = 0; pm[0].rm_eo = String.size(); int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND); // Failure to match is not an error, it's just a normal return value. // Any other error code is considered abnormal, and is logged in the Error. if (rc == REG_NOMATCH) return false; if (rc != 0) { if (Error) RegexErrorToString(error, preg, *Error); return false; } // There was a match. if (Matches) { // match position requested Matches->clear(); for (unsigned i = 0; i != nmatch; ++i) { if (pm[i].rm_so == -1) { // this group didn't match Matches->push_back(StringRef()); continue; } assert(pm[i].rm_eo >= pm[i].rm_so); Matches->push_back(StringRef(String.data()+pm[i].rm_so, pm[i].rm_eo-pm[i].rm_so)); } } return true; } std::string Regex::sub(StringRef Repl, StringRef String, std::string *Error) const { SmallVector Matches; // Return the input if there was no match. if (!match(String, &Matches, Error)) return std::string(String); // Otherwise splice in the replacement string, starting with the prefix before // the match. std::string Res(String.begin(), Matches[0].begin()); // Then the replacement string, honoring possible substitutions. while (!Repl.empty()) { // Skip to the next escape. std::pair Split = Repl.split('\\'); // Add the skipped substring. Res += Split.first; // Check for terminimation and trailing backslash. if (Split.second.empty()) { if (Repl.size() != Split.first.size() && Error && Error->empty()) *Error = "replacement string contained trailing backslash"; break; } // Otherwise update the replacement string and interpret escapes. Repl = Split.second; // FIXME: We should have a StringExtras function for mapping C99 escapes. switch (Repl[0]) { // Backreference with the "\g" syntax case 'g': if (Repl.size() >= 4 && Repl[1] == '<') { size_t End = Repl.find('>'); StringRef Ref = Repl.slice(2, End); unsigned RefValue; if (End != StringRef::npos && !Ref.getAsInteger(10, RefValue)) { Repl = Repl.substr(End + 1); if (RefValue < Matches.size()) Res += Matches[RefValue]; else if (Error && Error->empty()) *Error = ("invalid backreference string 'g<" + Twine(Ref) + ">'").str(); break; } } [[fallthrough]]; // Treat all unrecognized characters as self-quoting. default: Res += Repl[0]; Repl = Repl.substr(1); break; // Single character escapes. case 't': Res += '\t'; Repl = Repl.substr(1); break; case 'n': Res += '\n'; Repl = Repl.substr(1); break; // Decimal escapes are backreferences. case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { // Extract the backreference number. StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789")); Repl = Repl.substr(Ref.size()); unsigned RefValue; if (!Ref.getAsInteger(10, RefValue) && RefValue < Matches.size()) Res += Matches[RefValue]; else if (Error && Error->empty()) *Error = ("invalid backreference string '" + Twine(Ref) + "'").str(); break; } } } // And finally the suffix. Res += StringRef(Matches[0].end(), String.end() - Matches[0].end()); return Res; } // These are the special characters matched in functions like "p_ere_exp". static const char RegexMetachars[] = "()^$|*+?.[]\\{}"; bool Regex::isLiteralERE(StringRef Str) { // Check for regex metacharacters. This list was derived from our regex // implementation in regcomp.c and double checked against the POSIX extended // regular expression specification. return Str.find_first_of(RegexMetachars) == StringRef::npos; } std::string Regex::escape(StringRef String) { std::string RegexStr; for (char C : String) { if (strchr(RegexMetachars, C)) RegexStr += '\\'; RegexStr += C; } return RegexStr; }