diff --git a/packages/bigframes/noxfile.py b/packages/bigframes/noxfile.py index 5dba688d3c4a..09364c4e6ff9 100644 --- a/packages/bigframes/noxfile.py +++ b/packages/bigframes/noxfile.py @@ -605,11 +605,11 @@ def prerelease(session: nox.sessions.Session, tests_path, extra_pytest_options=( # Workaround https://github.com/googleapis/python-db-dtypes-pandas/issues/178 "db-dtypes", # Ensure we catch breaking changes in the client libraries early. - "git+https://github.com/googleapis/python-bigquery.git#egg=google-cloud-bigquery", + "git+https://github.com/googleapis/google-cloud-python.git#egg=google-cloud-bigquery&subdirectory=packages/google-cloud-bigquery", "--upgrade", "-e", "git+https://github.com/googleapis/google-cloud-python.git#egg=google-cloud-bigquery-storage&subdirectory=packages/google-cloud-bigquery-storage", - "git+https://github.com/googleapis/python-bigquery-pandas.git#egg=pandas-gbq", + "git+https://github.com/googleapis/google-cloud-python.git#egg=pandas-gbq&subdirectory=packages/pandas-gbq", ) # Print out prerelease package versions. diff --git a/packages/bigframes/specs/bigframes-bigquery-contributing.md b/packages/bigframes/specs/bigframes-bigquery-contributing.md new file mode 100644 index 000000000000..10931af0755f --- /dev/null +++ b/packages/bigframes/specs/bigframes-bigquery-contributing.md @@ -0,0 +1,501 @@ +# bigframes.bigquery inputs and outputs policies + +The goal of the [bigframes.bigquery +APIs](https://dataframes.bigquery.dev/reference/api/bigframes.bigquery.html#module-bigframes.bigquery) +is to provide the simplest possible mapping from BigQuery (GoogleSQL) +[functions](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/functions-all) +and +[operations](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax) +to Python. "Simplest" is somewhat ambiguous though, when it comes to the types +involved and behaviors, so this document aims to expand on that vision with +specific examples. 
+ +## SQL and BigFrames expression types + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
SQL expression type(s) + Python type(s) + Notes + Examples +
Column expression (usable in a SELECT clause) + + + Both Python Series and column expression should be supported as inputs, + with the output reflecting the users input. Use a TypeVar + rather than directly using union types to make type checking easier. +

+Special considerations for Series inputs: +

+If an input and output are both a Series with the same number of rows, make sure +the output Series is implicitly (row identity) alignable with the original +input. In other words, don't generate a table expression. +

+If there are multiple Series inputs, they should be implicitly aligned if +possible so as not to generate unnecessary table expressions. +

Most scalar functions accept one or more column expressions as input. +
Scalar values + + + Theoretically, we could try to get the type system to help the user + disambiguate between this case and the "Column expression" case, but I think + that's more trouble than it's worth with regard to the expectations of + Python users. + + +
Table expression + bpd.DataFrame +

+All columns are included as normal columns in the input table expression, +including named index columns. If column names aren't unique or contain +characters not compatible with BigQuery flexible column names, raise an error. +

+Outputs are unordered and unindexed to allow for cleaner mapping with SQL. +

Most APIs that take a table expression as input, also output a table + expression with the same number of rows and passing through all unused + columns. + +

This should be used to pass through any index or ordering columns (as well + as all other columns, if that's the SQL behavior), to allow for easy joining + with the original input DataFrame. +

Same number of rows as the input, so we should preserve index and ordering: + + + +

+ Different number of rows in output, so no need to preserve index or ordering. + Default index / ordering should be specified with the Session's + configuration: + +

+ +

+ Possible to have the same number of rows as the input, but joining with the original goes against the purpose of the feature: + +

+ +
Table name + string (referring to fully-qualified table ID, e.g. project.dataset.table / project.catalog.namespace.table) + Some SQL APIs do not support or have limitations with arbitrary table expressions, instead taking in a table ID, such as TABLESAMPLE expression. +

+Also, SEARCH and VECTOR_SEARCH, if you want the indexes attached to the table to actually apply. +

+For outputs, it might be preferable to output a table ID instead of a DataFrame, if the user is explicitly creating a table. For example, to_gbq() returns a string with the table name, which is useful for the case where BigFrame generates the table ID for the user. +

All of the items from the "Table expression" row above. APIs that require a table expression, but don't take a table ID can trivially take a table ID through a (SELECT * FROM table) subquery. +

+Some APIs only take a table ID and not an arbitrary table expression:

+ +
Aggregated table expression + DataFrameGroupBy + + + +
Analytic table expression +
    + +
  • DataFrameGroupBy - feasibility TBD +
  • Deferred column Expression with a Window applied.
+ +
+ + +
Column name (unqualified*) \ + \ +*I've only encountered examples where the table name / table expression is passed in separately. + string, +

+For cases where the column name is used as an alias and we aren't using named Series: +

+dict[str, Expression] +

Often a table expression input is paired with a column name input, as is the case with the CREATE MODEL and VECTOR_SEARCH APIs +

+If SQL expects a column name rather than a column expression, do not attempt to change this in Python. For example, don't allow a Series as a substitute for DataFrames + Column name. \ + \ +If the associated table expression is input as a DataFrame, validate that these map cleanly to SQL and raise a ValueError if not. For example: \ +

    + +
  • Duplicate column names (excluding unnamed index columns). +
  • Column names that are some hashable value other than integer (which maps cleanly to a column name) or string. +
  • Any column name containing a punctuation mark that is not allowed by BigQuery flexible column names, such as ! or $.
+ +
+ +
Literal values + corresponding literal Python value (e.g. int, float, string) + For cases where scalar values are also supported, it should be safe to start with this and then expand to support expressions without a breaking change, as is done in https://github.com/googleapis/google-cloud-python/pull/16606. + Most scalar functions accept one or more literal values as input. +
Scalar subqueries + Not supported yet, except implicitly in some aggregation use cases. +

+Would need some sort of bigframes deferred expression that can be tied to a table expression. +

+(Possibly DataFrame with 1 column?) +

+ +
+ +## Python policies + +### Naming + +Take the SQL function name, keyword name (used as a function name in Python), or argument name and transform them to lower_snake_case to reflect Python conventions. + +### Internal expressions + +Prefer creating deferred BigFrames expression objects where feasible. For +example, all scalar outputting functions should return a +`bigframes.pandas.Series` or `bigframes.core.col.Expression` that wraps a +`bigframes.core.expression.Expression`. + +Prefer returning a `bigframes.pandas.DataFrame` that wraps a +`bigframes.core.bigframe_node.BigFrameNode`. See `from_bq_data_source` in +`bigframes.core.array_value.ArrayValue`, as an example. + +Exceptions to this are cases where the output schema is likely to evolve or +differ in ways that are difficult to model, such as the `ML.PREDICT` SQL +function, where output columns differ based on the model type and support for +model types is frequently added to BigQuery. In these exceptional cases, the +generated query should run immediately and the returned value should wrap the +results. + +### Argument syntax details + +Arguments in Python can be one of: + +* Positional + * Supported by `*args` in Python, but not recommended. Positional arguments in SQL should map to named positional or keyword arguments in Python. +* Positional or keyword + * Required positional arguments should be positional, just like they are in SQL. +* Keyword-only + * All other arguments should be keyword-only. Use `, *,` Python syntax to achieve this. + +For optional parameters, use an optional sentinel (see the example below) and omit the value from the generated SQL if the user doesn't explicitly provide one. This ensures that an explicit NULL / None value can be passed in. + +``` + +from enum import Enum + +class Default(Enum): + token = 0 + +DEFAULT = Default.token + +def spam(*, ham: list[str] | None | Default = DEFAULT): + op_kwargs = {} + + if ham is not DEFAULT: + op_kwargs['ham'] = ham + + ... 
+ +``` + +### Scalar operations types policies + +Many operations output a table expression. For these, the output type is always a DataFrame, regardless of the input types. + +For scalar operations, there are three cases to consider when determining the output types: + + + + + + + + + + + + + + + + + + +
Scalar ops - Input type(s) + Scalar ops - Output type +
Expression + Expression +
Series / DataFrame + Series / DataFrame +

+Preserve ordering and index(es). Join inputs as needed before applying the operation. +

Mix of Expression and Series / DataFrame + Series / DataFrame +

+Preserve ordering and index(es). Join inputs as needed before applying the operation. +

+ +## Examples + +### PIVOT SQL operator + +SQL syntax ([docs](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#pivot_operator)): + +``` +FROM from_item[, ...] pivot_operator + +pivot_operator: + PIVOT( + aggregate_function_call [as_alias][, ...] + FOR input_column + IN ( pivot_column [as_alias][, ...] ) + ) [AS alias] + +as_alias: + [AS] alias + +``` + +SQL example: + +``` +WITH Produce AS ( + SELECT 'Kale' as product, 51 as sales, 'Q1' as quarter, 2020 as year UNION ALL + SELECT 'Kale', 23, 'Q2', 2020 UNION ALL + SELECT 'Kale', 45, 'Q3', 2020 UNION ALL + SELECT 'Kale', 3, 'Q4', 2020 UNION ALL + SELECT 'Kale', 70, 'Q1', 2021 UNION ALL + SELECT 'Kale', 85, 'Q2', 2021 UNION ALL + SELECT 'Apple', 77, 'Q1', 2020 UNION ALL + SELECT 'Apple', 0, 'Q2', 2020 UNION ALL + SELECT 'Apple', 1, 'Q1', 2021) +SELECT * FROM Produce + +/*---------+-------+---------+------+ + | product | sales | quarter | year | + +---------+-------+---------+------| + | Kale | 51 | Q1 | 2020 | + | Kale | 23 | Q2 | 2020 | + | Kale | 45 | Q3 | 2020 | + | Kale | 3 | Q4 | 2020 | + | Kale | 70 | Q1 | 2021 | + | Kale | 85 | Q2 | 2021 | + | Apple | 77 | Q1 | 2020 | + | Apple | 0 | Q2 | 2020 | + | Apple | 1 | Q1 | 2021 | + +---------+-------+---------+------*/ + + +SELECT * FROM + Produce + PIVOT(SUM(sales) FOR quarter IN ('Q1', 'Q2', 'Q3', 'Q4')) + +/*---------+------+----+------+------+------+ + | product | year | Q1 | Q2 | Q3 | Q4 | + +---------+------+----+------+------+------+ + | Apple | 2020 | 77 | 0 | NULL | NULL | + | Apple | 2021 | 1 | NULL | NULL | NULL | + | Kale | 2020 | 51 | 23 | 45 | 3 | + | Kale | 2021 | 70 | 85 | NULL | NULL | + +---------+------+----+------+------+------*/ + +``` + +Python definition: + +``` +def pivot( + table_expression: bpd.DataFrame, + *, + aggregation: Expression | dict[str, Expression], + input_column: str, + pivot_columns: dict[str, float | str | ...] | Sequence[float | str | ...], +) -> bpd.DataFrame: + ... 
+``` + +Since pivot creates a table expression, we run immediately. + + \ +Python usage: + +``` +pivotted = bbq.pivot( + my_produce_dataframe, + aggregation=bpd.col("sales").sum(), + input_column="quarter", + pivot_columns=["Q1", "Q2", "Q3", "Q4"], +) +``` + +### UNPIVOT SQL operator + +SQL syntax ([docs](https://docs.cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#unpivot_operator)): + +``` +FROM from_item[, ...] unpivot_operator + +unpivot_operator: + UNPIVOT [ { INCLUDE NULLS | EXCLUDE NULLS } ] ( + { single_column_unpivot | multi_column_unpivot } + ) [unpivot_alias] + +single_column_unpivot: + values_column + FOR name_column + IN (columns_to_unpivot) + +multi_column_unpivot: + values_column_set + FOR name_column + IN (column_sets_to_unpivot) + +values_column_set: + (values_column[, ...]) + +columns_to_unpivot: + unpivot_column [row_value_alias][, ...] + +column_sets_to_unpivot: + (unpivot_column [row_value_alias][, ...]) + +unpivot_alias and row_value_alias: + [AS] alias +``` + +SQL example: + +``` +WITH Produce AS ( + SELECT 'Kale' as product, 51 as Q1, 23 as Q2, 45 as Q3, 3 as Q4 UNION ALL + SELECT 'Apple', 77, 0, 25, 2) + +-- SELECT * FROM Produce +/*---------+----+----+----+----+ + | product | Q1 | Q2 | Q3 | Q4 | + +---------+----+----+----+----+ + | Kale | 51 | 23 | 45 | 3 | + | Apple | 77 | 0 | 25 | 2 | + +---------+----+----+----+----*/ + +SELECT * FROM Produce +UNPIVOT(sales FOR quarter IN (Q1, Q2, Q3, Q4)) -- single_column_unpivot + +/*---------+-------+---------+ + | product | sales | quarter | + +---------+-------+---------+ + | Kale | 51 | Q1 | + | Kale | 23 | Q2 | + | Kale | 45 | Q3 | + | Kale | 3 | Q4 | + | Apple | 77 | Q1 | + | Apple | 0 | Q2 | + | Apple | 25 | Q3 | + | Apple | 2 | Q4 | + +---------+-------+---------*/ +``` + +Python definition: + +``` +def unpivot( + table_expression: bpd.DataFrame, + *, + exclude_nulls: bool = True, + values_column: str | Sequence[str], + name_column: str, + columns_to_unpivot: dict[str, str 
| int] | Sequence[str], +) -> bpd.DataFrame: + ... +``` + +Since unpivot creates a table expression, we run immediately. + + \ +Python usage: + +``` +unpivotted = bbq.unpivot( + my_produce_dataframe, + values_column="sales", + name_column="quarter", + columns_to_unpivot=["Q1", "Q2", "Q3", "Q4"], +) +``` diff --git a/packages/bigframes/tests/system/small/test_series.py b/packages/bigframes/tests/system/small/test_series.py index c1ca6ebba55f..5df88e930432 100644 --- a/packages/bigframes/tests/system/small/test_series.py +++ b/packages/bigframes/tests/system/small/test_series.py @@ -33,6 +33,7 @@ import bigframes.pandas import bigframes.series as series import bigframes.testing +import bigframes.testing.utils from bigframes.testing.utils import ( assert_frame_equal, assert_series_equal, @@ -1232,23 +1233,12 @@ def test_divmods_series(scalars_dfs, col_x, col_y, method): scalars_pandas_df[col_y] ) # BigQuery's mod functions return NUMERIC values for non-INT64 inputs. - if bf_div_result.dtype == pd.Int64Dtype(): - bigframes.testing.utils.assert_series_equal( - pd_div_result, bf_div_result.to_pandas(), check_dtype=False - ) - else: - bigframes.testing.utils.assert_series_equal( - pd_div_result, bf_div_result.astype("Float64").to_pandas() - ) - - if bf_mod_result.dtype == pd.Int64Dtype(): - bigframes.testing.utils.assert_series_equal( - pd_mod_result, bf_mod_result.to_pandas() - ) - else: - bigframes.testing.utils.assert_series_equal( - pd_mod_result, bf_mod_result.astype("Float64").to_pandas() - ) + bigframes.testing.utils.assert_series_equal( + pd_div_result, bf_div_result.to_pandas(), check_dtype=False + ) + bigframes.testing.utils.assert_series_equal( + pd_mod_result, bf_mod_result.to_pandas(), check_dtype=False + ) @pytest.mark.parametrize(