Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 18 additions & 12 deletions cpp/src/parquet/printer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -281,21 +281,27 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
<< "\"StatsSet\": ";
if (column_chunk->is_stats_set()) {
stream << "\"True\", \"Stats\": {";
bool comma = false;
if (stats->HasNullCount()) {
stream << "\"NumNulls\": \"" << stats->null_count();
comma = true;
stream << "\"NumNulls\": \"" << stats->null_count() << "\"";
}
if (stats->HasDistinctCount()) {
stream << "\", "
<< "\"DistinctValues\": \"" << stats->distinct_count();
if (comma)
stream << ", ";
comma = true;
stream << "\"DistinctValues\": \"" << stats->distinct_count() << "\"";
}
if (stats->HasMinMax()) {
if (comma)
stream << ", ";
comma = true;
std::string min = stats->EncodeMin(), max = stats->EncodeMax();
stream << "\", "
<< "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
stream << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
<< "\", "
<< "\"Min\": \"" << FormatStatValue(descr->physical_type(), min);
<< "\"Min\": \"" << FormatStatValue(descr->physical_type(), min) << "\"";
}
stream << "\" },";
stream << " },";
} else {
stream << "\"False\",";
}
Expand All @@ -312,11 +318,11 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
}
stream << "\", "
<< "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
<< "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
<< "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size() << "\"";

if (column_chunk->bloom_filter_offset()) {
// Output BloomFilter {offset, length}
stream << "\", BloomFilter {"
stream << ", \"BloomFilter\": {"
<< "\"offset\": \"" << column_chunk->bloom_filter_offset().value();
if (column_chunk->bloom_filter_length()) {
stream << "\", \"length\": \"" << column_chunk->bloom_filter_length().value();
Expand All @@ -327,7 +333,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
if (column_chunk->GetColumnIndexLocation()) {
auto location = column_chunk->GetColumnIndexLocation().value();
// Output ColumnIndex {offset, length}
stream << "\", ColumnIndex {"
stream << ", \"ColumnIndex\": {"
<< "\"offset\": \"" << location.offset;
stream << "\", \"length\": \"" << location.length;
stream << "\"}";
Expand All @@ -336,14 +342,14 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
if (column_chunk->GetOffsetIndexLocation()) {
auto location = column_chunk->GetOffsetIndexLocation().value();
// Output OffsetIndex {offset, length}
stream << "\", OffsetIndex {"
stream << ", \"OffsetIndex\": {"
<< "\"offset\": \"" << location.offset;
stream << "\", \"length\": \"" << location.length;
stream << "\"}";
}

// end of a ColumnChunk
stream << "\" }";
stream << " }";
c1++;
if (c1 != static_cast<int>(selected_columns.size())) {
stream << ",\n";
Expand Down
22 changes: 16 additions & 6 deletions cpp/src/parquet/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,22 @@ std::string FormatStatValue(Type::type parquet_type, ::std::string_view val) {
result << i32_val[0] << " " << i32_val[1] << " " << i32_val[2];
break;
}
case Type::BYTE_ARRAY: {
return std::string(val);
}
case Type::FIXED_LEN_BYTE_ARRAY: {
return std::string(val);
}
case Type::BYTE_ARRAY:
case Type::FIXED_LEN_BYTE_ARRAY:
// Escape byte arrays to be usable in json strings.
for (char c : val) {
if (c == '\\' || c == '"')
result << '\\' << c;
else if (c >= 32 && c <= 126)
result << c;
else
// What to do if the byte array is not valid UTF-8?
// There doesn't seem to be a standard way to reversibly convert byte strings to valid UTF-8 while keeping plain ASCII readable.
// For now we just emit the bytes unchanged (possibly invalid UTF-8), which JSON parsers may or may not accept; it is unclear whether this is the best option.
result << c;
// result << "�";
}
break;
case Type::UNDEFINED:
default:
break;
Expand Down
Loading