Skip to content

Commit

Permalink
Compare all values by bytes rather than adding Intern trait
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve committed Jan 8, 2025
1 parent e342a38 commit ef980a7
Show file tree
Hide file tree
Showing 3 changed files with 3 additions and 73 deletions.
7 changes: 0 additions & 7 deletions parquet/src/arrow/arrow_writer/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,13 +326,6 @@ impl Storage for ByteArrayStorage {
}
}

impl crate::util::interner::Intern for [u8] {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

/// A dictionary encoder for byte array data
#[derive(Debug, Default)]
struct DictEncoder {
Expand Down
60 changes: 0 additions & 60 deletions parquet/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,6 @@ pub(crate) mod private {
use super::{ParquetError, Result, SliceAsBytes};
use crate::basic::Type;
use crate::file::metadata::HeapSize;
use crate::util::interner::Intern;

/// Sealed trait to start to remove specialisation from implementations
///
Expand All @@ -632,7 +631,6 @@ pub(crate) mod private {
+ HeapSize
+ crate::encodings::decoding::private::GetDecoder
+ crate::file::statistics::private::MakeStatistics
+ Intern
{
const PHYSICAL_TYPE: Type;

Expand Down Expand Up @@ -1127,64 +1125,6 @@ pub(crate) mod private {
self.0.heap_size()
}
}

impl Intern for bool {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for i32 {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for i64 {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for super::Int96 {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for f32 {
#[inline]
fn eq(&self, other: &Self) -> bool {
// Treat NaN == NaN when interning values
self.total_cmp(other) == std::cmp::Ordering::Equal
}
}

impl Intern for f64 {
#[inline]
fn eq(&self, other: &Self) -> bool {
// Treat NaN == NaN when interning values
self.total_cmp(other) == std::cmp::Ordering::Equal
}
}

impl Intern for super::ByteArray {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for super::FixedLenByteArray {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}
}

/// Contains the Parquet physical type information as well as the Rust primitive type
Expand Down
9 changes: 3 additions & 6 deletions parquet/src/util/interner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ const DEFAULT_DEDUP_CAPACITY: usize = 4096;
pub trait Storage {
type Key: Copy;

type Value: Intern + ?Sized;
type Value: AsBytes + ?Sized;

/// Gets an element by its key
fn get(&self, idx: Self::Key) -> &Self::Value;
Expand All @@ -37,10 +37,6 @@ pub trait Storage {
fn estimated_memory_size(&self) -> usize;
}

pub trait Intern: AsBytes {
fn eq(&self, other: &Self) -> bool;
}

/// A generic value interner supporting various different [`Storage`]
#[derive(Debug, Default)]
pub struct Interner<S: Storage> {
Expand Down Expand Up @@ -70,7 +66,8 @@ impl<S: Storage> Interner<S> {
.dedup
.entry(
hash,
|index| value.eq(self.storage.get(*index)),
// Compare bytes rather than directly comparing values so NaNs can be interned
|index| value.as_bytes() == self.storage.get(*index).as_bytes(),
|key| self.state.hash_one(self.storage.get(*key).as_bytes()),
)
.or_insert_with(|| self.storage.push(value))
Expand Down

0 comments on commit ef980a7

Please sign in to comment.