fix: update data type of all numeric fields in storage schema to Float64
nikhilsinhaparseable committed Dec 10, 2024
1 parent cda3e2b commit 8550152
Showing 4 changed files with 56 additions and 22 deletions.
8 changes: 8 additions & 0 deletions src/catalog/column.rs
@@ -66,6 +66,14 @@ impl TypedStatistics {
max: max(this.max, other.max),
})
}

// Ints are cast to Float if self is Float and other is Int
(TypedStatistics::Float(this), TypedStatistics::Int(other)) => {
TypedStatistics::Float(Float64Type {
min: this.min.min(other.min as f64),
max: this.max.max(other.max as f64),
})
}
(TypedStatistics::Float(this), TypedStatistics::Float(other)) => {
TypedStatistics::Float(Float64Type {
min: this.min.min(other.min),
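The diff for this file is truncated above. As a minimal, hedged sketch of the merge rule these arms implement — the `Int64Type`/`Float64Type` shapes are assumed from the surrounding lines, not copied from the crate:

    // Assumed stat containers: simple { min, max } structs per type.
    struct Int64Type { min: i64, max: i64 }
    struct Float64Type { min: f64, max: f64 }

    enum TypedStatistics {
        Int(Int64Type),
        Float(Float64Type),
    }

    impl TypedStatistics {
        fn update(self, other: Self) -> Self {
            match (self, other) {
                // New arm from this commit: widen the Int side to f64.
                (TypedStatistics::Float(this), TypedStatistics::Int(other)) => {
                    TypedStatistics::Float(Float64Type {
                        min: this.min.min(other.min as f64),
                        max: this.max.max(other.max as f64),
                    })
                }
                (TypedStatistics::Float(this), TypedStatistics::Float(other)) => {
                    TypedStatistics::Float(Float64Type {
                        min: this.min.min(other.min),
                        max: this.max.max(other.max),
                    })
                }
                (TypedStatistics::Int(this), TypedStatistics::Int(other)) => {
                    TypedStatistics::Int(Int64Type {
                        min: this.min.min(other.min),
                        max: this.max.max(other.max),
                    })
                }
                // The real impl covers more variants (e.g. the Int/Float mirror case).
                _ => unimplemented!(),
            }
        }
    }

    fn main() {
        let merged = TypedStatistics::Float(Float64Type { min: 1.5, max: 9.0 })
            .update(TypedStatistics::Int(Int64Type { min: -3, max: 12 }));
        if let TypedStatistics::Float(f) = merged {
            assert_eq!((f.min, f.max), (-3.0, 12.0));
        }
    }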
38 changes: 18 additions & 20 deletions src/event/format/mod.rs
@@ -103,6 +103,9 @@ pub trait EventFormat: Sized {
return Err(anyhow!("Schema mismatch"));
}
new_schema = update_field_type_in_schema(new_schema, None, time_partition, None);
new_schema = Arc::new(Schema::new(override_num_fields_from_schema(
new_schema.fields().to_vec(),
)));
let rb = Self::decode(data, new_schema.clone())?;
let tags_arr = StringArray::from_iter_values(std::iter::repeat(&tags).take(rb.num_rows()));
let metadata_arr =
@@ -205,24 +208,21 @@ pub fn override_timestamp_fields(
}

/// All numeric fields from the inferred schema are forced to Float64
pub fn override_num_fields_from_schema(schema: Arc<Schema>) -> Arc<Schema> {
Arc::new(Schema::new(
schema
.fields()
.iter()
.map(|field| {
if field.data_type().is_numeric() {
Arc::new(Field::new(
field.name(),
DataType::Float64,
field.is_nullable(),
))
} else {
field.clone()
}
})
.collect::<Vec<Arc<Field>>>(),
))
pub fn override_num_fields_from_schema(schema: Vec<Arc<Field>>) -> Vec<Arc<Field>> {
schema
.iter()
.map(|field| {
if field.data_type().is_numeric() && field.data_type() != &DataType::Float64 {
Arc::new(Field::new(
field.name(),
DataType::Float64,
field.is_nullable(),
))
} else {
field.clone()
}
})
.collect::<Vec<Arc<Field>>>()
}
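A hypothetical call site for the new signature, which now takes and returns `Vec<Arc<Field>>` instead of `Arc<Schema>`. Field names here are illustrative, and `override_num_fields_from_schema` from the diff above is assumed to be in scope:

    use std::sync::Arc;
    use arrow_schema::{DataType, Field, Schema};

    fn main() {
        // Hypothetical inferred schema mixing numeric and non-numeric fields.
        let inferred: Vec<Arc<Field>> = vec![
            Arc::new(Field::new("status", DataType::Int64, true)),
            Arc::new(Field::new("latency", DataType::Float64, true)),
            Arc::new(Field::new("host", DataType::Utf8, true)),
        ];
        let schema = Schema::new(override_num_fields_from_schema(inferred));
        // Only "status" is rewritten; Float64 and non-numeric fields pass through.
        assert_eq!(
            schema.field_with_name("status").unwrap().data_type(),
            &DataType::Float64
        );
    }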

pub fn update_field_type_in_schema(
@@ -232,8 +232,6 @@ pub fn update_field_type_in_schema(
log_records: Option<&Vec<Value>>,
) -> Arc<Schema> {
let mut updated_schema = inferred_schema.clone();
updated_schema = override_num_fields_from_schema(updated_schema);

if let Some(existing_schema) = existing_schema {
let existing_fields = get_existing_fields(inferred_schema.clone(), Some(existing_schema));
let existing_timestamp_fields = get_existing_timestamp_fields(existing_schema);
14 changes: 14 additions & 0 deletions src/metadata.rs
@@ -164,6 +164,20 @@ impl StreamInfo {
Ok(Arc::new(schema))
}

/// Update the schema in the metadata
pub fn set_schema(
&self,
stream_name: &str,
schema: HashMap<String, Arc<Field>>,
) -> Result<(), MetadataError> {
let mut map = self.write().expect(LOCK_EXPECT);
map.get_mut(stream_name)
.ok_or(MetadataError::StreamMetaNotFound(stream_name.to_string()))
.map(|metadata| {
metadata.schema = schema;
})
}
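A hedged example of calling the new method. The stream name and the `merged_schema` variable are illustrative; the map shape mirrors the call added in src/storage/object_storage.rs below:

    // Build the name -> field map that StreamInfo caches, then swap it in.
    let schema_map: HashMap<String, Arc<Field>> = merged_schema
        .fields()
        .iter()
        .map(|field| (field.name().clone(), Arc::clone(field)))
        .collect();
    // Fails with MetadataError::StreamMetaNotFound if the stream is unknown.
    STREAM_INFO.set_schema("app-logs", schema_map)?;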

pub fn set_alert(&self, stream_name: &str, alerts: Alerts) -> Result<(), MetadataError> {
let mut map = self.write().expect(LOCK_EXPECT);
map.get_mut(stream_name)
18 changes: 16 additions & 2 deletions src/storage/object_storage.rs
@@ -25,6 +25,7 @@ use super::{
SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY,
};

use crate::event::format::override_num_fields_from_schema;
use crate::handlers::http::modal::ingest_server::INGESTOR_META;
use crate::handlers::http::users::{DASHBOARDS_DIR, FILTER_DIR, USERS_ROOT_DIR};
use crate::metrics::{EVENTS_STORAGE_SIZE_DATE, LIFETIME_EVENTS_STORAGE_SIZE};
@@ -40,7 +41,7 @@ use crate::{
};

use actix_web_prometheus::PrometheusMetrics;
use arrow_schema::Schema;
use arrow_schema::{Field, Schema};
use async_trait::async_trait;
use bytes::Bytes;
use chrono::Local;
@@ -667,8 +668,21 @@ pub async fn commit_schema_to_storage(
schema: Schema,
) -> Result<(), ObjectStorageError> {
let storage = CONFIG.storage().get_object_store();
let stream_schema = storage.get_schema(stream_name).await?;
let mut stream_schema = storage.get_schema(stream_name).await?;
// override the data type of all numeric fields to Float64,
// if the data type is not already Float64
stream_schema = Schema::new(override_num_fields_from_schema(
stream_schema.fields().iter().cloned().collect(),
));
let new_schema = Schema::try_merge(vec![schema, stream_schema]).unwrap();

// update the merged schema in the metadata and storage
let schema_map: HashMap<String, Arc<Field>> = new_schema
.fields()
.iter()
.map(|field| (field.name().clone(), Arc::clone(field)))
.collect();
let _ = STREAM_INFO.set_schema(stream_name, schema_map);
storage.put_schema(stream_name, &new_schema).await
}
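Taken together, an ingest path might invoke this as follows. The stream name, the `event_schema` variable, and the error handling are illustrative only:

    // After decoding an event whose schema widened, persist the merged schema;
    // numeric fields in the stored schema are first coerced to Float64.
    commit_schema_to_storage("app-logs", event_schema).await?;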
