-
Notifications
You must be signed in to change notification settings - Fork 2k
Spark dayname function implementation #20825
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
1da480c
412c9b5
7500ee6
7352c8b
56378d3
835585c
ddbbb43
8ecb079
b07e2ad
631e9bc
76bcf9c
e900486
7562581
8a7e413
1809710
989c812
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,126 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| use arrow::array::{Array, ArrayRef, AsArray, StringArray}; | ||
| use arrow::compute::{CastOptions, DatePart, cast_with_options, date_part}; | ||
| use arrow::datatypes::{DataType, Field, FieldRef, Int32Type}; | ||
| use datafusion::logical_expr::{ | ||
| Coercion, ColumnarValue, Signature, TypeSignature, TypeSignatureClass, Volatility, | ||
| }; | ||
| use datafusion_common::types::{logical_date, logical_string}; | ||
| use datafusion_common::utils::take_function_args; | ||
| use datafusion_common::{Result, internal_err}; | ||
| use datafusion_expr::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl}; | ||
| use datafusion_functions::utils::make_scalar_function; | ||
| use std::sync::Arc; | ||
|
|
||
/// Spark-compatible `dayname` UDF: returns the abbreviated English name of
/// the day of week ("Mon".."Sun") for a date, timestamp, or date string.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SparkDayName {
    // One argument, coercible to timestamp, date, or string (built in `new`).
    signature: Signature,
}
|
|
||
impl Default for SparkDayName {
    /// Equivalent to [`SparkDayName::new`].
    fn default() -> Self {
        Self::new()
    }
}
|
|
||
| impl SparkDayName { | ||
| pub fn new() -> Self { | ||
| Self { | ||
| signature: Signature::one_of( | ||
| vec![ | ||
| TypeSignature::Coercible(vec![Coercion::new_exact( | ||
| TypeSignatureClass::Timestamp, | ||
| )]), | ||
| TypeSignature::Coercible(vec![Coercion::new_exact( | ||
| TypeSignatureClass::Native(logical_date()), | ||
| )]), | ||
| TypeSignature::Coercible(vec![Coercion::new_exact( | ||
| TypeSignatureClass::Native(logical_string()), | ||
| )]), | ||
| ], | ||
| Volatility::Immutable, | ||
| ), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| impl ScalarUDFImpl for SparkDayName { | ||
| fn name(&self) -> &str { | ||
| "dayname" | ||
| } | ||
|
|
||
| fn signature(&self) -> &Signature { | ||
| &self.signature | ||
| } | ||
|
|
||
| fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { | ||
| internal_err!("return_field_from_args should be used instead") | ||
| } | ||
|
|
||
| fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> { | ||
| Ok(Arc::new(Field::new( | ||
| self.name(), | ||
| DataType::Utf8, | ||
| args.arg_fields[0].is_nullable(), | ||
| ))) | ||
| } | ||
|
|
||
| fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> { | ||
| make_scalar_function(spark_day_name, vec![])(&args.args) | ||
| } | ||
| } | ||
|
|
||
| fn spark_day_name(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
| let [array] = take_function_args("dayname", args)?; | ||
| match array.data_type() { | ||
| DataType::Date32 | DataType::Timestamp(_, _) => spark_day_name_inner(array), | ||
| DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 => { | ||
| let date_array = | ||
| cast_with_options(array, &DataType::Date32, &CastOptions::default())?; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What would happen in case of an invalid string / error here? We could probably handle it here and throw an error if the input is malformed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Invalid strings will be replaced with null values. |
||
| spark_day_name_inner(&date_array) | ||
| } | ||
| other => { | ||
| internal_err!("Unsupported arg {other:?} for Spark function `dayname`") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| fn spark_day_name_inner(array: &ArrayRef) -> Result<ArrayRef> { | ||
| let result: StringArray = date_part(array, DatePart::DayOfWeekMonday0)? | ||
| .as_primitive::<Int32Type>() | ||
| .iter() | ||
| .map(|x| x.and_then(get_display_name)) | ||
| .collect(); | ||
| Ok(Arc::new(result)) | ||
| } | ||
|
|
||
/// Maps a Monday-based day-of-week index (0 = Monday .. 6 = Sunday) to its
/// abbreviated English day name, or `None` for an out-of-range index
/// (defensive only: `DatePart::DayOfWeekMonday0` produces 0..=6).
///
/// Returns `&'static str` rather than an owned `String`: the names are
/// compile-time constants, and `StringArray` collects from
/// `Option<impl AsRef<str>>`, so per-row allocation in the caller is
/// unnecessary.
///
/// This function supports only the English locale, matching the behavior of
/// Spark's `dayname` function, which returns English day names regardless of
/// the system or session locale.
fn get_display_name(day: i32) -> Option<&'static str> {
    match day {
        0 => Some("Mon"),
        1 => Some("Tue"),
        2 => Some("Wed"),
        3 => Some("Thu"),
        4 => Some("Fri"),
        5 => Some("Sat"),
        6 => Some("Sun"),
        _ => None,
    }
}
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we are handling timezone support here? Spark handles the timezone in its current implementation, and this could produce wrong results.
kazantsev-maksim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| # Licensed to the Apache Software Foundation (ASF) under one | ||
| # or more contributor license agreements. See the NOTICE file | ||
| # distributed with this work for additional information | ||
| # regarding copyright ownership. The ASF licenses this file | ||
| # to you under the Apache License, Version 2.0 (the | ||
| # "License"); you may not use this file except in compliance | ||
| # with the License. You may obtain a copy of the License at | ||
|
|
||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
||
| # Unless required by applicable law or agreed to in writing, | ||
| # software distributed under the License is distributed on an | ||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| # KIND, either express or implied. See the License for the | ||
| # specific language governing permissions and limitations | ||
| # under the License. | ||
|
|
||
# Tests for the Spark-compatible `dayname` function: DATE, STRING, and
# TIMESTAMP inputs, NULL propagation, and invalid-string handling.

query T
SELECT dayname('2008-02-20'::DATE);
----
Wed

query T
SELECT dayname(NULL::DATE);
----
NULL

query T
SELECT dayname('2026-03-09'::DATE);
----
Mon

query T
SELECT dayname('2026-03-08'::DATE);
----
Sun

query T
SELECT dayname('1948-08-10'::DATE);
----
Tue

query T
SELECT dayname('1987-11-13'::DATE);
----
Fri

query T
SELECT dayname('2000-07-27'::DATE);
----
Thu

query T
SELECT dayname('2010-04-24'::DATE);
----
Sat

query T
SELECT dayname('2010-04-24'::STRING);
----
Sat

query T
SELECT dayname(NULL::STRING);
----
NULL

# An empty (unparseable) string yields NULL, matching Spark's non-ANSI
# behavior. NOTE(review): Spark 4 defaults to ANSI mode, where this errors
# instead — consider an `ansi` config option to support both behaviors.
query T
SELECT dayname(''::STRING);
----
NULL

# NOTE(review): consider adding timestamps with a time component and at
# least one non-UTC timezone.
query T
SELECT dayname('2010-04-24'::TIMESTAMP);
----
Sat

query T
SELECT dayname(NULL::TIMESTAMP);
----
NULL
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Spark's date (`java.util.Date`) is `Date32` on the Rust side of things. Given that we are not handling `Date64` in the match arm, do you still think we should have `logical_date()` as one of the signatures supporting both Date32 and Date64? Perhaps we could change it to Date32?