//===-- StringPrinter.cpp -------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "lldb/DataFormatters/StringPrinter.h" #include "lldb/Core/Debugger.h" #include "lldb/Core/ValueObject.h" #include "lldb/Target/Language.h" #include "lldb/Target/Process.h" #include "lldb/Target/Target.h" #include "lldb/Utility/Status.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ConvertUTF.h" #include #include #include using namespace lldb; using namespace lldb_private; using namespace lldb_private::formatters; using GetPrintableElementType = StringPrinter::GetPrintableElementType; using StringElementType = StringPrinter::StringElementType; /// DecodedCharBuffer stores the decoded contents of a single character. It /// avoids managing memory on the heap by copying decoded bytes into an in-line /// buffer. class DecodedCharBuffer { public: DecodedCharBuffer(std::nullptr_t) {} DecodedCharBuffer(const uint8_t *bytes, size_t size) : m_size(size) { if (size > MaxLength) llvm_unreachable("unsupported length"); memcpy(m_data, bytes, size); } DecodedCharBuffer(const char *bytes, size_t size) : DecodedCharBuffer(reinterpret_cast(bytes), size) {} const uint8_t *GetBytes() const { return m_data; } size_t GetSize() const { return m_size; } private: static constexpr unsigned MaxLength = 16; size_t m_size = 0; uint8_t m_data[MaxLength] = {0}; }; using EscapingHelper = std::function; // we define this for all values of type but only implement it for those we // care about that's good because we get linker errors for any unsupported type template static DecodedCharBuffer GetPrintableImpl(uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, StringPrinter::EscapeStyle escape_style); // Mimic isprint() for Unicode codepoints. static bool isprint32(char32_t codepoint) { if (codepoint <= 0x1F || codepoint == 0x7F) // C0 { return false; } if (codepoint >= 0x80 && codepoint <= 0x9F) // C1 { return false; } if (codepoint == 0x2028 || codepoint == 0x2029) // line/paragraph separators { return false; } if (codepoint == 0x200E || codepoint == 0x200F || (codepoint >= 0x202A && codepoint <= 0x202E)) // bidirectional text control { return false; } if (codepoint >= 0xFFF9 && codepoint <= 0xFFFF) // interlinears and generally specials { return false; } return true; } DecodedCharBuffer attemptASCIIEscape(llvm::UTF32 c, StringPrinter::EscapeStyle escape_style) { const bool is_swift_escape_style = escape_style == StringPrinter::EscapeStyle::Swift; switch (c) { case 0: return {"\\0", 2}; case '\a': return {"\\a", 2}; case '\b': if (is_swift_escape_style) return nullptr; return {"\\b", 2}; case '\f': if (is_swift_escape_style) return nullptr; return {"\\f", 2}; case '\n': return {"\\n", 2}; case '\r': return {"\\r", 2}; case '\t': return {"\\t", 2}; case '\v': if (is_swift_escape_style) return nullptr; return {"\\v", 2}; case '\"': return {"\\\"", 2}; case '\'': if (is_swift_escape_style) return {"\\'", 2}; return nullptr; case '\\': return {"\\\\", 2}; } return nullptr; } template <> DecodedCharBuffer GetPrintableImpl( uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, StringPrinter::EscapeStyle escape_style) { // The ASCII helper always advances 1 byte at a time. next = buffer + 1; DecodedCharBuffer retval = attemptASCIIEscape(*buffer, escape_style); if (retval.GetSize()) return retval; // Use llvm's locale-independent isPrint(char), instead of the libc // implementation which may give different results on different platforms. if (llvm::isPrint(*buffer)) return {buffer, 1}; unsigned escaped_len; constexpr unsigned max_buffer_size = 7; uint8_t data[max_buffer_size]; switch (escape_style) { case StringPrinter::EscapeStyle::CXX: // Prints 4 characters, then a \0 terminator. escaped_len = snprintf((char *)data, max_buffer_size, "\\x%02x", *buffer); break; case StringPrinter::EscapeStyle::Swift: // Prints up to 6 characters, then a \0 terminator. escaped_len = snprintf((char *)data, max_buffer_size, "\\u{%x}", *buffer); break; } lldbassert(escaped_len > 0 && "unknown string escape style"); return {data, escaped_len}; } template <> DecodedCharBuffer GetPrintableImpl( uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, StringPrinter::EscapeStyle escape_style) { // If the utf8 encoded length is invalid (i.e., not in the closed interval // [1;4]), or if there aren't enough bytes to print, or if the subsequence // isn't valid utf8, fall back to printing an ASCII-escaped subsequence. if (!llvm::isLegalUTF8Sequence(buffer, buffer_end)) return GetPrintableImpl(buffer, buffer_end, next, escape_style); // Convert the valid utf8 sequence to a utf32 codepoint. This cannot fail. llvm::UTF32 codepoint = 0; const llvm::UTF8 *buffer_for_conversion = buffer; llvm::ConversionResult result = llvm::convertUTF8Sequence( &buffer_for_conversion, buffer_end, &codepoint, llvm::strictConversion); assert(result == llvm::conversionOK && "Failed to convert legal utf8 sequence"); UNUSED_IF_ASSERT_DISABLED(result); // The UTF8 helper always advances by the utf8 encoded length. const unsigned utf8_encoded_len = buffer_for_conversion - buffer; next = buffer + utf8_encoded_len; DecodedCharBuffer retval = attemptASCIIEscape(codepoint, escape_style); if (retval.GetSize()) return retval; if (isprint32(codepoint)) return {buffer, utf8_encoded_len}; unsigned escaped_len; constexpr unsigned max_buffer_size = 13; uint8_t data[max_buffer_size]; switch (escape_style) { case StringPrinter::EscapeStyle::CXX: // Prints 10 characters, then a \0 terminator. escaped_len = snprintf((char *)data, max_buffer_size, "\\U%08x", codepoint); break; case StringPrinter::EscapeStyle::Swift: // Prints up to 12 characters, then a \0 terminator. escaped_len = snprintf((char *)data, max_buffer_size, "\\u{%x}", codepoint); break; } lldbassert(escaped_len > 0 && "unknown string escape style"); return {data, escaped_len}; } // Given a sequence of bytes, this function returns: a sequence of bytes to // actually print out + a length the following unscanned position of the buffer // is in next static DecodedCharBuffer GetPrintable(StringElementType type, uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next, StringPrinter::EscapeStyle escape_style) { if (!buffer || buffer >= buffer_end) return {nullptr}; switch (type) { case StringElementType::ASCII: return GetPrintableImpl(buffer, buffer_end, next, escape_style); case StringElementType::UTF8: return GetPrintableImpl(buffer, buffer_end, next, escape_style); default: return {nullptr}; } } static EscapingHelper GetDefaultEscapingHelper(GetPrintableElementType elem_type, StringPrinter::EscapeStyle escape_style) { switch (elem_type) { case GetPrintableElementType::UTF8: case GetPrintableElementType::ASCII: return [escape_style, elem_type](uint8_t *buffer, uint8_t *buffer_end, uint8_t *&next) -> DecodedCharBuffer { return GetPrintable(elem_type == GetPrintableElementType::UTF8 ? StringElementType::UTF8 : StringElementType::ASCII, buffer, buffer_end, next, escape_style); }; } llvm_unreachable("bad element type"); } /// Read a string encoded in accordance with \tparam SourceDataType from a /// host-side LLDB buffer, then pretty-print it to a stream using \p style. template static bool DumpEncodedBufferToStream( GetPrintableElementType style, llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, const SourceDataType *, llvm::UTF8 **, llvm::UTF8 *, llvm::ConversionFlags), const StringPrinter::ReadBufferAndDumpToStreamOptions &dump_options) { assert(dump_options.GetStream() && "need a Stream to print the string to"); Stream &stream(*dump_options.GetStream()); if (dump_options.GetPrefixToken() != nullptr) stream.Printf("%s", dump_options.GetPrefixToken()); if (dump_options.GetQuote() != 0) stream.Printf("%c", dump_options.GetQuote()); auto data(dump_options.GetData()); auto source_size(dump_options.GetSourceSize()); if (data.GetByteSize() && data.GetDataStart() && data.GetDataEnd()) { const int bufferSPSize = data.GetByteSize(); if (dump_options.GetSourceSize() == 0) { const int origin_encoding = 8 * sizeof(SourceDataType); source_size = bufferSPSize / (origin_encoding / 4); } const SourceDataType *data_ptr = (const SourceDataType *)data.GetDataStart(); const SourceDataType *data_end_ptr = data_ptr + source_size; const bool zero_is_terminator = dump_options.GetBinaryZeroIsTerminator(); if (zero_is_terminator) { while (data_ptr < data_end_ptr) { if (!*data_ptr) { data_end_ptr = data_ptr; break; } data_ptr++; } data_ptr = (const SourceDataType *)data.GetDataStart(); } lldb::WritableDataBufferSP utf8_data_buffer_sp; llvm::UTF8 *utf8_data_ptr = nullptr; llvm::UTF8 *utf8_data_end_ptr = nullptr; if (ConvertFunction) { utf8_data_buffer_sp = std::make_shared(4 * bufferSPSize, 0); utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); utf8_data_end_ptr = utf8_data_ptr + utf8_data_buffer_sp->GetByteSize(); ConvertFunction(&data_ptr, data_end_ptr, &utf8_data_ptr, utf8_data_end_ptr, llvm::lenientConversion); if (!zero_is_terminator) utf8_data_end_ptr = utf8_data_ptr; // needed because the ConvertFunction will change the value of the // data_ptr. utf8_data_ptr = (llvm::UTF8 *)utf8_data_buffer_sp->GetBytes(); } else { // just copy the pointers - the cast is necessary to make the compiler // happy but this should only happen if we are reading UTF8 data utf8_data_ptr = const_cast( reinterpret_cast(data_ptr)); utf8_data_end_ptr = const_cast( reinterpret_cast(data_end_ptr)); } const bool escape_non_printables = dump_options.GetEscapeNonPrintables(); EscapingHelper escaping_callback; if (escape_non_printables) escaping_callback = GetDefaultEscapingHelper(style, dump_options.GetEscapeStyle()); // since we tend to accept partial data (and even partially malformed data) // we might end up with no NULL terminator before the end_ptr hence we need // to take a slower route and ensure we stay within boundaries for (; utf8_data_ptr < utf8_data_end_ptr;) { if (zero_is_terminator && !*utf8_data_ptr) break; if (escape_non_printables) { uint8_t *next_data = nullptr; auto printable = escaping_callback(utf8_data_ptr, utf8_data_end_ptr, next_data); auto printable_bytes = printable.GetBytes(); auto printable_size = printable.GetSize(); // We failed to figure out how to print this string. if (!printable_bytes || !next_data) return false; for (unsigned c = 0; c < printable_size; c++) stream.Printf("%c", *(printable_bytes + c)); utf8_data_ptr = (uint8_t *)next_data; } else { stream.Printf("%c", *utf8_data_ptr); utf8_data_ptr++; } } } if (dump_options.GetQuote() != 0) stream.Printf("%c", dump_options.GetQuote()); if (dump_options.GetSuffixToken() != nullptr) stream.Printf("%s", dump_options.GetSuffixToken()); if (dump_options.GetIsTruncated()) stream.Printf("..."); return true; } lldb_private::formatters::StringPrinter::ReadStringAndDumpToStreamOptions:: ReadStringAndDumpToStreamOptions(ValueObject &valobj) : ReadStringAndDumpToStreamOptions() { SetEscapeNonPrintables( valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables()); } lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions:: ReadBufferAndDumpToStreamOptions(ValueObject &valobj) : ReadBufferAndDumpToStreamOptions() { SetEscapeNonPrintables( valobj.GetTargetSP()->GetDebugger().GetEscapeNonPrintables()); } lldb_private::formatters::StringPrinter::ReadBufferAndDumpToStreamOptions:: ReadBufferAndDumpToStreamOptions( const ReadStringAndDumpToStreamOptions &options) : ReadBufferAndDumpToStreamOptions() { SetStream(options.GetStream()); SetPrefixToken(options.GetPrefixToken()); SetSuffixToken(options.GetSuffixToken()); SetQuote(options.GetQuote()); SetEscapeNonPrintables(options.GetEscapeNonPrintables()); SetBinaryZeroIsTerminator(options.GetBinaryZeroIsTerminator()); SetEscapeStyle(options.GetEscapeStyle()); } namespace lldb_private { namespace formatters { template static bool ReadEncodedBufferAndDumpToStream( StringElementType elem_type, const StringPrinter::ReadStringAndDumpToStreamOptions &options, llvm::ConversionResult (*ConvertFunction)(const SourceDataType **, const SourceDataType *, llvm::UTF8 **, llvm::UTF8 *, llvm::ConversionFlags)) { assert(options.GetStream() && "need a Stream to print the string to"); if (!options.GetStream()) return false; if (options.GetLocation() == 0 || options.GetLocation() == LLDB_INVALID_ADDRESS) return false; lldb::TargetSP target_sp = options.GetTargetSP(); if (!target_sp) return false; constexpr int type_width = sizeof(SourceDataType); constexpr int origin_encoding = 8 * type_width; if (origin_encoding != 8 && origin_encoding != 16 && origin_encoding != 32) return false; // If not UTF8 or ASCII, conversion to UTF8 is necessary. if (origin_encoding != 8 && !ConvertFunction) return false; bool needs_zero_terminator = options.GetNeedsZeroTermination(); bool is_truncated = false; const auto max_size = target_sp->GetMaximumSizeOfStringSummary(); uint32_t sourceSize; if (elem_type == StringElementType::ASCII && !options.GetSourceSize()) { // FIXME: The NSString formatter sets HasSourceSize(true) when the size is // actually unknown, as well as SetBinaryZeroIsTerminator(false). IIUC the // C++ formatter also sets SetBinaryZeroIsTerminator(false) when it doesn't // mean to. I don't see how this makes sense: we should fix the formatters. // // Until then, the behavior that's expected for ASCII strings with unknown // lengths is to read up to the max size and then null-terminate. Do that. sourceSize = max_size; needs_zero_terminator = true; } else if (options.HasSourceSize()) { sourceSize = options.GetSourceSize(); if (!options.GetIgnoreMaxLength()) { if (sourceSize > max_size) { sourceSize = max_size; is_truncated = true; } } } else { sourceSize = max_size; needs_zero_terminator = true; } const int bufferSPSize = sourceSize * type_width; lldb::WritableDataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0)); // Check if we got bytes. We never get any bytes if we have an empty // string, but we still continue so that we end up actually printing // an empty string (""). if (sourceSize != 0 && !buffer_sp->GetBytes()) return false; Status error; char *buffer = reinterpret_cast(buffer_sp->GetBytes()); if (elem_type == StringElementType::ASCII) target_sp->ReadCStringFromMemory(options.GetLocation(), buffer, bufferSPSize, error); else if (needs_zero_terminator) target_sp->ReadStringFromMemory(options.GetLocation(), buffer, bufferSPSize, error, type_width); else target_sp->ReadMemory(options.GetLocation(), buffer, bufferSPSize, error); if (error.Fail()) { options.GetStream()->Printf("unable to read data"); return true; } StringPrinter::ReadBufferAndDumpToStreamOptions dump_options(options); dump_options.SetData( DataExtractor(buffer_sp, target_sp->GetArchitecture().GetByteOrder(), target_sp->GetArchitecture().GetAddressByteSize())); dump_options.SetSourceSize(sourceSize); dump_options.SetIsTruncated(is_truncated); dump_options.SetNeedsZeroTermination(needs_zero_terminator); if (needs_zero_terminator) dump_options.SetBinaryZeroIsTerminator(true); GetPrintableElementType print_style = (elem_type == StringElementType::ASCII) ? GetPrintableElementType::ASCII : GetPrintableElementType::UTF8; return DumpEncodedBufferToStream(print_style, ConvertFunction, dump_options); } template <> bool StringPrinter::ReadStringAndDumpToStream( const ReadStringAndDumpToStreamOptions &options) { return ReadEncodedBufferAndDumpToStream(StringElementType::UTF8, options, nullptr); } template <> bool StringPrinter::ReadStringAndDumpToStream( const ReadStringAndDumpToStreamOptions &options) { return ReadEncodedBufferAndDumpToStream( StringElementType::UTF16, options, llvm::ConvertUTF16toUTF8); } template <> bool StringPrinter::ReadStringAndDumpToStream( const ReadStringAndDumpToStreamOptions &options) { return ReadEncodedBufferAndDumpToStream( StringElementType::UTF32, options, llvm::ConvertUTF32toUTF8); } template <> bool StringPrinter::ReadStringAndDumpToStream( const ReadStringAndDumpToStreamOptions &options) { return ReadEncodedBufferAndDumpToStream(StringElementType::ASCII, options, nullptr); } template <> bool StringPrinter::ReadBufferAndDumpToStream( const ReadBufferAndDumpToStreamOptions &options) { return DumpEncodedBufferToStream(GetPrintableElementType::UTF8, nullptr, options); } template <> bool StringPrinter::ReadBufferAndDumpToStream( const ReadBufferAndDumpToStreamOptions &options) { return DumpEncodedBufferToStream(GetPrintableElementType::UTF8, llvm::ConvertUTF16toUTF8, options); } template <> bool StringPrinter::ReadBufferAndDumpToStream( const ReadBufferAndDumpToStreamOptions &options) { return DumpEncodedBufferToStream(GetPrintableElementType::UTF8, llvm::ConvertUTF32toUTF8, options); } template <> bool StringPrinter::ReadBufferAndDumpToStream( const ReadBufferAndDumpToStreamOptions &options) { // Treat ASCII the same as UTF8. // // FIXME: This is probably not the right thing to do (well, it's debatable). // If an ASCII-encoded string happens to contain a sequence of invalid bytes // that forms a valid UTF8 character, we'll print out that character. This is // good if you're playing fast and loose with encodings (probably good for // std::string users), but maybe not so good if you care about your string // formatter respecting the semantics of your selected string encoding. In // the latter case you'd want to see the character byte sequence ('\x..'), not // the UTF8 character itself. return ReadBufferAndDumpToStream(options); } } // namespace formatters } // namespace lldb_private