Skip to content
This repository was archived by the owner on May 7, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions bigframes/operations/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ def get_runtime_json_str(
def exif(
self,
*,
engine: Literal[None, "pillow"] = None,
connection: Optional[str] = None,
max_batching_rows: int = 8192,
container_cpu: Union[float, int] = 0.33,
Expand All @@ -311,6 +312,7 @@ def exif(
"""Extract EXIF data. Now only support image types.

Args:
engine ('pillow' or None, default None): The engine (bigquery or third-party library) used for the function. Must be set explicitly to 'pillow'; leaving it as None raises a ValueError.
connection (str or None, default None): BQ connection used for function internet transactions, and the output blob if "dst" is str. If None, uses default connection of the session.
max_batching_rows (int, default 8,192): Max number of rows per batch sent to Cloud Run to execute the function.
container_cpu (int or float, default 0.33): number of container CPUs. Possible values are [0.33, 8]. Floats larger than 1 are cast to integers.
Expand All @@ -319,6 +321,8 @@ def exif(
Returns:
bigframes.series.Series: JSON series of key-value pairs.
"""
if engine is None or engine.casefold() != "pillow":
raise ValueError("Must specify the engine, supported value is 'pillow'.")

import bigframes.bigquery as bbq
import bigframes.blob._functions as blob_func
Expand All @@ -344,6 +348,7 @@ def image_blur(
self,
ksize: tuple[int, int],
*,
engine: Literal[None, "opencv"] = None,
dst: Optional[Union[str, bigframes.series.Series]] = None,
connection: Optional[str] = None,
max_batching_rows: int = 8192,
Expand All @@ -354,6 +359,7 @@ def image_blur(

Args:
ksize (tuple(int, int)): Kernel size.
engine ('opencv' or None, default None): The engine (bigquery or third-party library) used for the function. Must be set explicitly to 'opencv'; leaving it as None raises a ValueError.
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
str: GCS folder str. The output filenames are the same as the input files.
blob Series: The output file paths are determined by the uris of the blob Series.
Expand All @@ -367,6 +373,9 @@ def image_blur(
Returns:
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
"""
if engine is None or engine.casefold() != "opencv":
raise ValueError("Must specify the engine, supported value is 'opencv'.")

import bigframes.blob._functions as blob_func

connection = self._resolve_connection(connection)
Expand Down Expand Up @@ -424,6 +433,7 @@ def image_resize(
self,
dsize: tuple[int, int] = (0, 0),
*,
engine: Literal[None, "opencv"] = None,
fx: float = 0.0,
fy: float = 0.0,
dst: Optional[Union[str, bigframes.series.Series]] = None,
Expand All @@ -436,6 +446,7 @@ def image_resize(

Args:
dsize (tuple(int, int), default (0, 0)): Destination size. If set to 0, fx and fy parameters determine the size.
engine ('opencv' or None, default None): The engine (bigquery or third-party library) used for the function. Must be set explicitly to 'opencv'; leaving it as None raises a ValueError.
fx (float, default 0.0): scale factor along the horizontal axis. If set to 0.0, dsize parameter determines the output size.
fy (float, default 0.0): scale factor along the vertical axis. If set to 0.0, dsize parameter determines the output size.
dst (str or bigframes.series.Series or None, default None): Output destination. Can be one of:
Expand All @@ -451,6 +462,9 @@ def image_resize(
Returns:
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
"""
if engine is None or engine.casefold() != "opencv":
raise ValueError("Must specify the engine, supported value is 'opencv'.")

dsize_set = dsize[0] > 0 and dsize[1] > 0
fsize_set = fx > 0.0 and fy > 0.0
if not dsize_set ^ fsize_set:
Expand Down Expand Up @@ -516,6 +530,7 @@ def image_resize(
def image_normalize(
self,
*,
engine: Literal[None, "opencv"] = None,
alpha: float = 1.0,
beta: float = 0.0,
norm_type: str = "l2",
Expand All @@ -528,6 +543,7 @@ def image_normalize(
"""Normalize images.

Args:
engine ('opencv' or None, default None): The engine (bigquery or third-party library) used for the function. Must be set explicitly to 'opencv'; leaving it as None raises a ValueError.
alpha (float, default 1.0): Norm value to normalize to or the lower range boundary in case of the range normalization.
beta (float, default 0.0): Upper range boundary in case of the range normalization; it is not used for the norm normalization.
norm_type (str, default "l2"): Normalization type. Accepted values are "inf", "l1", "l2" and "minmax".
Expand All @@ -544,6 +560,9 @@ def image_normalize(
Returns:
bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ.
"""
if engine is None or engine.casefold() != "opencv":
raise ValueError("Must specify the engine, supported value is 'opencv'.")

import bigframes.blob._functions as blob_func

connection = self._resolve_connection(connection)
Expand Down Expand Up @@ -604,6 +623,7 @@ def image_normalize(
def pdf_extract(
self,
*,
engine: Literal[None, "pypdf"] = None,
connection: Optional[str] = None,
max_batching_rows: int = 1,
container_cpu: Union[float, int] = 2,
Expand All @@ -613,6 +633,7 @@ def pdf_extract(
"""Extracts text from PDF URLs and saves the text as string.

Args:
engine ('pypdf' or None, default None): The engine (bigquery or third-party library) used for the function. Must be set explicitly to 'pypdf'; leaving it as None raises a ValueError.
connection (str or None, default None): BQ connection used for
function internet transactions, and the output blob if "dst"
is str. If None, uses default connection of the session.
Expand All @@ -631,6 +652,9 @@ def pdf_extract(
Contains the extracted text from the PDF file.
Includes error messages if verbosity is enabled.
"""
if engine is None or engine.casefold() != "pypdf":
raise ValueError("Must specify the engine, supported value is 'pypdf'.")

import bigframes.bigquery as bbq
import bigframes.blob._functions as blob_func
import bigframes.pandas as bpd
Expand Down Expand Up @@ -663,6 +687,7 @@ def pdf_extract(
def pdf_chunk(
self,
*,
engine: Literal[None, "pypdf"] = None,
connection: Optional[str] = None,
chunk_size: int = 2000,
overlap_size: int = 200,
Expand All @@ -675,6 +700,7 @@ def pdf_chunk(
arrays of strings.

Args:
engine ('pypdf' or None, default None): The engine (bigquery or third-party library) used for the function. Must be set explicitly to 'pypdf'; leaving it as None raises a ValueError.
connection (str or None, default None): BQ connection used for
function internet transactions, and the output blob if "dst"
is str. If None, uses default connection of the session.
Expand All @@ -698,6 +724,8 @@ def pdf_chunk(
where each string is a chunk of text extracted from PDF.
Includes error messages if verbosity is enabled.
"""
if engine is None or engine.casefold() != "pypdf":
raise ValueError("Must specify the engine, supported value is 'pypdf'.")

import bigframes.bigquery as bbq
import bigframes.blob._functions as blob_func
Expand Down Expand Up @@ -740,6 +768,7 @@ def pdf_chunk(
def audio_transcribe(
self,
*,
engine: Literal["bigquery"] = "bigquery",
connection: Optional[str] = None,
model_name: Optional[
Literal[
Expand All @@ -753,6 +782,7 @@ def audio_transcribe(
Transcribe audio content using a Gemini multimodal model.

Args:
engine ('bigquery', default 'bigquery'): The engine (bigquery or third-party library) used for the function. Only 'bigquery' is supported; any other value raises a ValueError.
connection (str or None, default None): BQ connection used for
function internet transactions, and the output blob if "dst"
is str. If None, uses default connection of the session.
Expand All @@ -770,6 +800,9 @@ def audio_transcribe(
Contains the transcribed text from the audio file.
Includes error messages if verbosity is enabled.
"""
if engine.casefold() != "bigquery":
raise ValueError("Must specify the engine, supported value is 'bigquery'.")

import bigframes.bigquery as bbq
import bigframes.ml.llm as llm
import bigframes.pandas as bpd
Expand Down
9 changes: 5 additions & 4 deletions notebooks/multimodal/multimodal_dataframe.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -254,16 +254,17 @@
"outputs": [],
"source": [
"df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n",
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\"\n",
" (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n",
")\n",
"df_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n",
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\"\n",
" (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n",
")\n",
"df_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n",
" alpha=50.0,\n",
" beta=150.0,\n",
" norm_type=\"minmax\",\n",
" dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n",
" engine=\"opencv\",\n",
")"
]
},
Expand All @@ -280,7 +281,7 @@
"outputs": [],
"source": [
"# You can also chain functions together\n",
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\")"
"df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")"
]
},
{
Expand Down Expand Up @@ -419,7 +420,7 @@
},
"outputs": [],
"source": [
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk()"
"df_pdf[\"chunked\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\")"
]
},
{
Expand Down
9 changes: 5 additions & 4 deletions samples/snippets/multimodal_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,22 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:

# [START bigquery_dataframes_multimodal_dataframe_image_transform]
df_image["blurred"] = df_image["image"].blob.image_blur(
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/"
(20, 20), dst=f"{dst_bucket}/image_blur_transformed/", engine="opencv"
)
df_image["resized"] = df_image["image"].blob.image_resize(
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/"
(300, 200), dst=f"{dst_bucket}/image_resize_transformed/", engine="opencv"
)
df_image["normalized"] = df_image["image"].blob.image_normalize(
alpha=50.0,
beta=150.0,
norm_type="minmax",
dst=f"{dst_bucket}/image_normalize_transformed/",
engine="opencv",
)

# You can also chain functions together
df_image["blur_resized"] = df_image["blurred"].blob.image_resize(
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/"
(300, 200), dst=f"{dst_bucket}/image_blur_resize_transformed/", engine="opencv"
)
df_image
# [END bigquery_dataframes_multimodal_dataframe_image_transform]
Expand Down Expand Up @@ -113,7 +114,7 @@ def test_multimodal_dataframe(gcs_dst_bucket: str) -> None:
df_pdf = bpd.from_glob_path(
"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*", name="pdf"
)
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk()
df_pdf["chunked"] = df_pdf["pdf"].blob.pdf_chunk(engine="pypdf")
chunked = df_pdf["chunked"].explode()
chunked
# [END bigquery_dataframes_multimodal_dataframe_pdf_chunk]
Expand Down
40 changes: 29 additions & 11 deletions tests/system/large/blob/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ def test_blob_exif(
connection=bq_connection,
)

actual = exif_image_df["blob_col"].blob.exif(connection=bq_connection)
actual = exif_image_df["blob_col"].blob.exif(
engine="pillow", connection=bq_connection
)
expected = bpd.Series(
['{"ExifOffset": 47, "Make": "MyCamera"}'],
session=session,
Expand All @@ -86,7 +88,7 @@ def test_blob_image_blur_to_series(
)

actual = images_mm_df["blob_col"].blob.image_blur(
(8, 8), dst=series, connection=bq_connection
(8, 8), dst=series, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand Down Expand Up @@ -114,7 +116,7 @@ def test_blob_image_blur_to_folder(
images_output_uris: list[str],
):
actual = images_mm_df["blob_col"].blob.image_blur(
(8, 8), dst=images_output_folder, connection=bq_connection
(8, 8), dst=images_output_folder, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand All @@ -136,7 +138,9 @@ def test_blob_image_blur_to_folder(


def test_blob_image_blur_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
actual = images_mm_df["blob_col"].blob.image_blur((8, 8), connection=bq_connection)
actual = images_mm_df["blob_col"].blob.image_blur(
(8, 8), connection=bq_connection, engine="opencv"
)

assert isinstance(actual, bpd.Series)
assert len(actual) == 2
Expand All @@ -154,7 +158,7 @@ def test_blob_image_resize_to_series(
)

actual = images_mm_df["blob_col"].blob.image_resize(
(200, 300), dst=series, connection=bq_connection
(200, 300), dst=series, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand Down Expand Up @@ -182,7 +186,7 @@ def test_blob_image_resize_to_folder(
images_output_uris: list[str],
):
actual = images_mm_df["blob_col"].blob.image_resize(
(200, 300), dst=images_output_folder, connection=bq_connection
(200, 300), dst=images_output_folder, connection=bq_connection, engine="opencv"
)
expected_df = pd.DataFrame(
{
Expand All @@ -205,7 +209,7 @@ def test_blob_image_resize_to_folder(

def test_blob_image_resize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
actual = images_mm_df["blob_col"].blob.image_resize(
(200, 300), connection=bq_connection
(200, 300), connection=bq_connection, engine="opencv"
)

assert isinstance(actual, bpd.Series)
Expand All @@ -224,7 +228,12 @@ def test_blob_image_normalize_to_series(
)

actual = images_mm_df["blob_col"].blob.image_normalize(
alpha=50.0, beta=150.0, norm_type="minmax", dst=series, connection=bq_connection
alpha=50.0,
beta=150.0,
norm_type="minmax",
dst=series,
connection=bq_connection,
engine="opencv",
)
expected_df = pd.DataFrame(
{
Expand Down Expand Up @@ -257,6 +266,7 @@ def test_blob_image_normalize_to_folder(
norm_type="minmax",
dst=images_output_folder,
connection=bq_connection,
engine="opencv",
)
expected_df = pd.DataFrame(
{
Expand All @@ -279,7 +289,11 @@ def test_blob_image_normalize_to_folder(

def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection: str):
actual = images_mm_df["blob_col"].blob.image_normalize(
alpha=50.0, beta=150.0, norm_type="minmax", connection=bq_connection
alpha=50.0,
beta=150.0,
norm_type="minmax",
connection=bq_connection,
engine="opencv",
)

assert isinstance(actual, bpd.Series)
Expand Down Expand Up @@ -322,7 +336,7 @@ def test_blob_pdf_extract(
):
actual = (
pdf_mm_df["pdf"]
.blob.pdf_extract(connection=bq_connection, verbose=verbose)
.blob.pdf_extract(connection=bq_connection, verbose=verbose, engine="pypdf")
.explode()
.to_pandas()
)
Expand Down Expand Up @@ -373,7 +387,11 @@ def test_blob_pdf_chunk(
actual = (
pdf_mm_df["pdf"]
.blob.pdf_chunk(
connection=bq_connection, chunk_size=50, overlap_size=10, verbose=verbose
connection=bq_connection,
chunk_size=50,
overlap_size=10,
verbose=verbose,
engine="pypdf",
)
.explode()
.to_pandas()
Expand Down