Skip to content
This repository was archived by the owner on May 7, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
7a30750
feat: add pd.get_dummies
Oct 26, 2023
1b3fae0
Merge branch 'main' of github.com:googleapis/python-bigquery-datafram…
Oct 26, 2023
0aca97c
remove unneeded prefix case
Oct 26, 2023
706fa35
Merge branch 'main' into b297352026-get-dummies
milkshakeiii Oct 26, 2023
cb2a8b1
Merge branch 'main' of github.com:googleapis/python-bigquery-datafram…
Oct 27, 2023
0781c08
param/documentation fixes
Oct 27, 2023
eec7822
be stricter about types in test
Oct 27, 2023
bdee75e
be stricter about types in series test
Oct 27, 2023
15b1aa3
remove unneeded comment
Oct 27, 2023
ef57790
Merge branch 'b297352026-get-dummies' of github.com:googleapis/python…
Oct 27, 2023
0f7c38e
adjust for type difference in pandas 1
Oct 27, 2023
b060475
add example code (tested)
Oct 30, 2023
ce5ea69
fix None columns and add test cases
Oct 30, 2023
758bd6d
variable names and _get_unique_values per-column
Oct 30, 2023
dab3eba
account for pandas 1 behavior difference
Oct 30, 2023
b2032e1
remove already_seen set
Oct 30, 2023
1899a58
avoid unnecessary join/projection
Oct 30, 2023
1a71217
fix column ordering edge case
Oct 30, 2023
257531a
adjust for picky examples checker
Oct 30, 2023
87b358e
example tweak
Oct 31, 2023
979eb39
make part of the example comments
Oct 31, 2023
5b3dc18
use ellipsis in doctest comment
Oct 31, 2023
aa7a0a3
add <BLANKLINES> to doctest string
Oct 31, 2023
9db8707
Merge branch 'main' into b297352026-get-dummies
milkshakeiii Oct 31, 2023
f178012
extract parameter standardization
Nov 1, 2023
cc4aa4c
extract submethods
Nov 1, 2023
6bc36a5
Merge branch 'main' of github.com:googleapis/python-bigquery-datafram…
Nov 1, 2023
3fcdd5f
Merge branch 'b297352026-get-dummies' of github.com:googleapis/python…
Nov 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 177 additions & 0 deletions bigframes/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,18 @@
)

import bigframes._config as config
import bigframes.constants as constants
import bigframes.core.blocks
import bigframes.core.global_session as global_session
import bigframes.core.indexes
import bigframes.core.reshape
import bigframes.dataframe
import bigframes.operations as ops
import bigframes.series
import bigframes.session
import bigframes.session.clients
import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat
import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding
import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge
import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile

Expand Down Expand Up @@ -134,6 +138,179 @@ def cut(
cut.__doc__ = vendored_pandas_tile.cut.__doc__


def get_dummies(
    data: Union[DataFrame, Series],
    prefix: Union[List, dict, str, None] = None,
    prefix_sep: Union[List, dict, str, None] = "_",
    dummy_na: bool = False,
    columns: Optional[List] = None,
    drop_first: bool = False,
    dtype: Any = None,
) -> DataFrame:
    # simplify input parameters into per-input-label lists
    # also raise errors for invalid parameters
    column_labels, prefixes, prefix_seps = _standardize_get_dummies_params(
        data, prefix, prefix_sep, columns, dtype
    )

    # Nothing to encode (e.g. a DataFrame with no string-typed columns, or an
    # explicit columns=[]): match pandas by returning the data unchanged.
    # This also guards the `// len(column_labels)` below against
    # ZeroDivisionError.
    if not column_labels:
        return DataFrame(data._block)

    # combine prefixes into per-column-id list
    full_columns_prefixes, columns_ids = _determine_get_dummies_columns_from_labels(
        data, column_labels, prefix is not None, prefixes, prefix_seps
    )

    # run queries to compute unique values
    block = data._block
    # Budget the per-column unique-value limit so that, after adding one dummy
    # column per unique value, the result stays under BigQuery's column cap.
    max_unique_value = (
        bigframes.core.blocks._BQ_MAX_COLUMNS
        - len(block.value_columns)
        - len(block.index_columns)
        - 1
    ) // len(column_labels)
    columns_values = [
        block._get_unique_values([col_id], max_unique_value) for col_id in columns_ids
    ]

    # for each dummified column, add the content of the output columns via block operations
    intermediate_col_ids = []
    for i in range(len(columns_values)):
        # Sorted, NA-free unique values for this column (NA handled separately
        # via dummy_na).
        level = columns_values[i].get_level_values(0).sort_values().dropna()
        if drop_first:
            level = level[1:]
        column_label = full_columns_prefixes[i]
        column_id = columns_ids[i]
        block, new_intermediate_col_ids = _perform_get_dummies_block_operations(
            block, level, column_label, column_id, dummy_na
        )
        intermediate_col_ids.extend(new_intermediate_col_ids)

    # drop dummified columns (and the intermediate columns we added)
    block = block.drop_columns(columns_ids + intermediate_col_ids)
    return DataFrame(block)


get_dummies.__doc__ = vendored_pandas_encoding.get_dummies.__doc__


def _standardize_get_dummies_params(
    data: Union[DataFrame, Series],
    prefix: Union[List, dict, str, None],
    prefix_sep: Union[List, dict, str, None],
    columns: Optional[List],
    dtype: Any,
) -> Tuple[List, List[str], List[str]]:
    """Normalize get_dummies keyword arguments into per-label lists.

    Returns a tuple of (labels of the columns to encode, one prefix per
    label, one prefix separator per label). Raises for a non-list-like
    `columns` or an unsupported `dtype`.
    """
    block = data._block

    # A Series is treated as a single-column frame: encode its only column.
    if isinstance(data, Series):
        columns = [block.column_labels[0]]
    if columns is not None and not pandas.api.types.is_list_like(columns):
        raise TypeError("Input must be a list-like for parameter `columns`")
    boolean_dtypes = [
        pandas.BooleanDtype,
        bool,
        "Boolean",
        "boolean",
        "bool",
    ]
    if dtype is not None and dtype not in boolean_dtypes:
        raise NotImplementedError(
            f"Only Boolean dtype is currently supported. {constants.FEEDBACK_LINK}"
        )

    if columns is None:
        # Default: encode every string-typed column, preserving first-seen
        # label order and skipping duplicate labels.
        default_dummy_types = [pandas.StringDtype, "string[pyarrow]"]
        columns = []
        seen_labels = set()
        for col_id in block.value_columns:
            label = block.col_id_to_label[col_id]
            if label in seen_labels:
                continue
            if block.expr.get_column_type(col_id) in default_dummy_types:
                columns.append(label)
                seen_labels.add(label)

    column_labels: List = typing.cast(List, columns)

    def parse_prefix_kwarg(kwarg, kwarg_name) -> Optional[List[str]]:
        # Expand a scalar/dict/list-like prefix-style kwarg into exactly one
        # entry per encoded label.
        if kwarg is None:
            return None
        if isinstance(kwarg, str):
            return [kwarg] * len(column_labels)
        if isinstance(kwarg, dict):
            return [kwarg[column] for column in column_labels]
        kwarg = typing.cast(List, kwarg)
        if pandas.api.types.is_list_like(kwarg):
            if len(kwarg) != len(column_labels):
                raise ValueError(
                    f"Length of '{kwarg_name}' ({len(kwarg)}) did not match "
                    f"the length of the columns being encoded ({len(column_labels)})."
                )
            return list(map(str, kwarg))
        raise TypeError(f"{kwarg_name} kwarg must be a string, list, or dictionary")

    # NOTE(review): a falsy prefix_sep (e.g. "") silently falls back to "_";
    # preserved as-is here — confirm whether an empty separator should be
    # allowed, as it is in pandas.
    prefix_seps = parse_prefix_kwarg(prefix_sep or "_", "prefix_sep")
    prefix_seps = typing.cast(List, prefix_seps)
    prefixes = parse_prefix_kwarg(prefix, "prefix")
    if prefixes is None:
        # No prefix given: fall back to the column labels themselves.
        prefixes = column_labels
    prefixes = typing.cast(List, prefixes)

    return column_labels, prefixes, prefix_seps


def _determine_get_dummies_columns_from_labels(
    data: Union[DataFrame, Series],
    column_labels: List,
    prefix_given: bool,
    prefixes: List[str],
    prefix_seps: List[str],
) -> Tuple[List[str], List[str]]:
    """Expand encoded labels into column ids plus their full output prefixes.

    A duplicated label maps to every matching column id, each paired with the
    same full prefix (prefix + separator, or "" when the prefix is omitted).
    Returns (per-column-id full prefixes, column ids).
    """
    block = data._block

    result_ids = []
    result_prefixes = []
    for i, label in enumerate(column_labels):
        # The prefix is omitted for None labels, and for Series input unless
        # the caller explicitly supplied one.
        omit_prefix = label is None or (isinstance(data, Series) and not prefix_given)
        full_prefix = "" if omit_prefix else prefixes[i] + prefix_seps[i]

        for col_id in block.label_to_col_id[label]:
            result_ids.append(col_id)
            result_prefixes.append(full_prefix)

    return result_prefixes, result_ids


def _perform_get_dummies_block_operations(
    block: bigframes.core.blocks.Block,
    level: pandas.Index,
    column_label: str,
    column_id: str,
    dummy_na: bool,
) -> Tuple[bigframes.core.blocks.Block, List[str]]:
    """Append one boolean indicator column per value in `level` for
    `column_id`, plus an optional NA indicator.

    Returns the updated block together with the ids of the intermediate
    (pre-fillna) columns so the caller can drop them afterwards.
    """
    intermediate_col_ids = []
    for value in level:
        # With an empty prefix the dummy column is named by the value alone.
        result_label = value if column_label == "" else f"{column_label}{value}"
        # First compute (column == value) ...
        eq_block, eq_col_id = block.apply_unary_op(
            column_id, ops.BinopPartialLeft(ops.eq_op, value)
        )
        intermediate_col_ids.append(eq_col_id)
        # ... then coalesce NULLs to False so NA rows yield False dummies.
        block, _ = eq_block.apply_unary_op(
            eq_col_id,
            ops.BinopPartialRight(ops.fillna_op, False),
            result_label=result_label,
        )
    if dummy_na:
        # dummy column name for na depends on the dtype
        na_string = str(pandas.Index([None], dtype=level.dtype)[0])
        block, _ = block.apply_unary_op(
            column_id,
            ops.isnull_op,
            result_label=f"{column_label}{na_string}",
        )
    return block, intermediate_col_ids


def qcut(
x: bigframes.series.Series,
q: int,
Expand Down
112 changes: 112 additions & 0 deletions tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,118 @@ def test_concat_series(scalars_dfs):
pd.testing.assert_series_equal(bf_result, pd_result)


@pytest.mark.parametrize(
    ("kwargs"),
    [
        {
            "prefix": ["prefix1", "prefix2"],
            "prefix_sep": "_",
            "dummy_na": None,
            "columns": ["bool_col", "int64_col"],
            "drop_first": False,
        },
        {
            "prefix": "prefix",
            "prefix_sep": ["_", ","],
            "dummy_na": False,
            "columns": ["int64_too", "string_col"],
            "drop_first": False,
        },
        {
            "prefix": None,
            "prefix_sep": ".",
            "dummy_na": True,
            "columns": ["time_col", "float64_col"],
            "drop_first": True,
        },
    ],
)
def test_get_dummies_dataframe(scalars_dfs, kwargs):
    """get_dummies on a DataFrame matches pandas across prefix/sep/na/drop
    combinations."""
    scalars_df, scalars_pandas_df = scalars_dfs

    # the explicit dtype=bool is needed for pandas v1 only
    bf_result = bpd.get_dummies(scalars_df, **kwargs, dtype=bool)
    pd_result = pd.get_dummies(scalars_pandas_df, **kwargs, dtype=bool)

    # adjust for expected dtype differences: pandas emits numpy "bool",
    # BigQuery DataFrames emits nullable "boolean"
    for column_name, type_name in pd_result.dtypes.items():
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")

    pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)


def test_get_dummies_dataframe_duplicate_labels(scalars_dfs):
    """get_dummies handles duplicated and None column labels like pandas 2.x."""
    if pd.__version__.startswith("1."):
        pytest.skip("pandas has different behavior in 1.x")

    scalars_df, scalars_pandas_df = scalars_dfs

    # Create a duplicated label ("int64_col") and two None labels.
    rename_map = {"int64_too": "int64_col", "float64_col": None, "string_col": None}
    scalars_renamed_df = scalars_df.rename(columns=rename_map)
    scalars_renamed_pandas_df = scalars_pandas_df.rename(columns=rename_map)

    # the explicit dtype=bool is needed for pandas v1 only
    bf_result = bpd.get_dummies(
        scalars_renamed_df, columns=["int64_col", None], dtype=bool
    )
    pd_result = pd.get_dummies(
        scalars_renamed_pandas_df, columns=["int64_col", None], dtype=bool
    )

    # adjust for expected dtype differences: pandas emits numpy "bool",
    # BigQuery DataFrames emits nullable "boolean"
    for column_name, type_name in pd_result.dtypes.items():
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")

    pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)


def test_get_dummies_series(scalars_dfs):
    """get_dummies on a named Series matches pandas."""
    scalars_df, scalars_pandas_df = scalars_dfs
    bf_series = scalars_df.date_col
    pd_series = scalars_pandas_df.date_col

    # the explicit dtype=bool is needed for pandas v1 only
    bf_result = bpd.get_dummies(bf_series, dtype=bool)
    pd_result = pd.get_dummies(pd_series, dtype=bool)

    # adjust for expected dtype differences: pandas emits numpy "bool",
    # BigQuery DataFrames emits nullable "boolean"
    for column_name, type_name in pd_result.dtypes.items():
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")
    pd_result.columns = pd_result.columns.astype(object)

    pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)


def test_get_dummies_series_nameless(scalars_dfs):
    """get_dummies on an unnamed Series matches pandas (no prefix applied)."""
    scalars_df, scalars_pandas_df = scalars_dfs
    bf_series = scalars_df.date_col.rename(None)
    pd_series = scalars_pandas_df.date_col.rename(None)

    # the explicit dtype=bool is needed for pandas v1 only
    bf_result = bpd.get_dummies(bf_series, dtype=bool)
    pd_result = pd.get_dummies(pd_series, dtype=bool)

    # adjust for expected dtype differences: pandas emits numpy "bool",
    # BigQuery DataFrames emits nullable "boolean"
    for column_name, type_name in pd_result.dtypes.items():
        if type_name == "bool":
            pd_result[column_name] = pd_result[column_name].astype("boolean")
    pd_result.columns = pd_result.columns.astype(object)

    pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result)


@pytest.mark.parametrize(
("how"),
[
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/concat.py
"""
Concat routines.
Concat routines
"""
from __future__ import annotations

Expand Down
Loading