Skip to content

docs: add type mapping tables between PyIceberg and PyArrow#3098

Merged
kevinjqliu merged 5 commits intoapache:mainfrom
iamluan:docs-2226-typemapping-pyiceberg-pyarrow
Mar 17, 2026
Merged

docs: add type mapping tables between PyIceberg and PyArrow#3098
kevinjqliu merged 5 commits intoapache:mainfrom
iamluan:docs-2226-typemapping-pyiceberg-pyarrow

Conversation

@iamluan
Copy link
Contributor

@iamluan iamluan commented Feb 25, 2026

Closes #2226

Rationale for this change

This PR adds documentation with tables describing the type mapping between PyArrow and PyIceberg data types.

Are these changes tested?

Yes.
The changes are tested locally as shown in the image below.
image

Are there any user-facing changes?

Yes.
This PR adds new user-facing documentation.

@kevinjqliu
Copy link
Contributor

this is great, thank you!
I'm not a big fan of documenting using Python files. Could you add it as a markdown file instead? Similar to #2480

Perhaps we can add it to the API section https://py.iceberg.apache.org/api/

@iamluan iamluan force-pushed the docs-2226-typemapping-pyiceberg-pyarrow branch from 9e80178 to 26b12e0 Compare February 26, 2026 19:44
@iamluan
Copy link
Contributor Author

iamluan commented Feb 26, 2026

Thank you for your review. I have added the markdown to the API section.
image

@iamluan iamluan force-pushed the docs-2226-typemapping-pyiceberg-pyarrow branch from 26b12e0 to ae85ac9 Compare March 10, 2026 10:34
Copy link
Contributor

@kevinjqliu kevinjqliu left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM! I added a few changes with the help of Claude.
Mainly:

  • marked v3 Iceberg types
  • expanded eligible PyArrow types
  • added numbered references to notes
Screenshot 2026-03-17 at 10 01 38 AM Screenshot 2026-03-17 at 10 01 50 AM Screenshot 2026-03-17 at 10 02 02 AM

import pyarrow as pa
```

#### PyIceberg to PyArrow type mapping
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code reference:

class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType]):
    """Schema visitor that translates an Iceberg schema into its PyArrow equivalent.

    Field-level metadata (doc strings, field IDs, ORC required flags) is attached
    to the resulting Arrow fields according to the target file format.
    """

    # Schema-level metadata to attach to the resulting pa.schema.
    _metadata: dict[bytes, bytes]

    def __init__(
        self, metadata: dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True, file_format: FileFormat | None = None
    ) -> None:
        self._metadata = metadata
        self._include_field_ids = include_field_ids
        self._file_format = file_format

    def schema(self, _: Schema, struct_result: pa.StructType) -> pa.schema:
        # The top-level schema carries the metadata handed to the constructor.
        return pa.schema(list(struct_result), metadata=self._metadata)

    def struct(self, _: StructType, field_results: builtins.list[pa.DataType]) -> pa.DataType:
        return pa.struct(field_results)

    def field(self, field: NestedField, field_result: pa.DataType) -> pa.Field:
        """Build a pa.Field for an Iceberg field, attaching format-specific metadata."""
        field_meta = {}
        if field.doc:
            field_meta[PYARROW_FIELD_DOC_KEY] = field.doc
        is_orc = self._file_format == FileFormat.ORC
        if self._include_field_ids:
            # The metadata key used for the field ID depends on the file format;
            # Parquet is the default for backward compatibility.
            id_key = ORC_FIELD_ID_KEY if is_orc else PYARROW_PARQUET_FIELD_ID_KEY
            field_meta[id_key] = str(field.field_id)
        if is_orc:
            field_meta[ORC_FIELD_REQUIRED_KEY] = str(field.required).lower()
        return pa.field(
            name=field.name,
            type=field_result,
            nullable=field.optional,
            metadata=field_meta,
        )

    def list(self, list_type: ListType, element_result: pa.DataType) -> pa.DataType:
        return pa.large_list(value_type=self.field(list_type.element_field, element_result))

    def map(self, map_type: MapType, key_result: pa.DataType, value_result: pa.DataType) -> pa.DataType:
        return pa.map_(
            key_type=self.field(map_type.key_field, key_result),
            item_type=self.field(map_type.value_field, value_result),
        )

    def visit_fixed(self, fixed_type: FixedType) -> pa.DataType:
        return pa.binary(len(fixed_type))

    def visit_decimal(self, decimal_type: DecimalType) -> pa.DataType:
        # It looks like decimal{32,64} is not fully implemented:
        # https://github.com/apache/arrow/issues/25483
        # https://github.com/apache/arrow/issues/43956
        # However, if we keep it as 128 in memory, and based on the
        # precision/scale Arrow will map it to INT{32,64}
        # https://github.com/apache/arrow/blob/598938711a8376cbfdceaf5c77ab0fd5057e6c02/cpp/src/parquet/arrow/schema.cc#L380-L392
        return pa.decimal128(decimal_type.precision, decimal_type.scale)

    def visit_boolean(self, _: BooleanType) -> pa.DataType:
        return pa.bool_()

    def visit_integer(self, _: IntegerType) -> pa.DataType:
        return pa.int32()

    def visit_long(self, _: LongType) -> pa.DataType:
        return pa.int64()

    def visit_float(self, _: FloatType) -> pa.DataType:
        # 32-bit IEEE 754 floating point
        return pa.float32()

    def visit_double(self, _: DoubleType) -> pa.DataType:
        # 64-bit IEEE 754 floating point
        return pa.float64()

    def visit_date(self, _: DateType) -> pa.DataType:
        # Date encoded as an int
        return pa.date32()

    def visit_time(self, _: TimeType) -> pa.DataType:
        return pa.time64("us")

    def visit_timestamp(self, _: TimestampType) -> pa.DataType:
        return pa.timestamp(unit="us")

    def visit_timestamp_ns(self, _: TimestampNanoType) -> pa.DataType:
        return pa.timestamp(unit="ns")

    def visit_timestamptz(self, _: TimestamptzType) -> pa.DataType:
        return pa.timestamp(unit="us", tz="UTC")

    def visit_timestamptz_ns(self, _: TimestamptzNanoType) -> pa.DataType:
        return pa.timestamp(unit="ns", tz="UTC")

    def visit_string(self, _: StringType) -> pa.DataType:
        return pa.large_string()

    def visit_uuid(self, _: UUIDType) -> pa.DataType:
        return pa.uuid()

    def visit_unknown(self, _: UnknownType) -> pa.DataType:
        """Type `UnknownType` can be promoted to any primitive type in V3+ tables per the Iceberg spec."""
        return pa.null()

    def visit_binary(self, _: BinaryType) -> pa.DataType:
        return pa.large_binary()

    def visit_geometry(self, geometry_type: GeometryType) -> pa.DataType:
        """Convert geometry type to PyArrow type.

        When geoarrow-pyarrow is available, returns a GeoArrow WKB extension type
        with CRS metadata. Otherwise, falls back to large_binary which stores WKB bytes.
        """
        try:
            import geoarrow.pyarrow as ga
        except ImportError:
            return pa.large_binary()
        return ga.wkb().with_crs(geometry_type.crs)

    def visit_geography(self, geography_type: GeographyType) -> pa.DataType:
        """Convert geography type to PyArrow type.

        When geoarrow-pyarrow is available, returns a GeoArrow WKB extension type
        with CRS and edge type metadata. Otherwise, falls back to large_binary which stores WKB bytes.
        """
        try:
            import geoarrow.pyarrow as ga
        except ImportError:
            return pa.large_binary()
        wkb_type = ga.wkb().with_crs(geography_type.crs)
        # "planar" is the default edge type in GeoArrow, so only spherical
        # needs to be set explicitly.
        if geography_type.algorithm == "spherical":
            wkb_type = wkb_type.with_edge_type(ga.EdgeType.SPHERICAL)
        return wkb_type


---

#### PyArrow to PyIceberg type mapping
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code reference:

class _ConvertToIceberg(PyArrowSchemaVisitor[IcebergType | Schema]):
    """Converts PyArrowSchema to Iceberg Schema. Applies the IDs from name_mapping if provided."""

    # Stack of field names from the root down to the field currently being
    # visited; joined with "." to produce the field path in error messages.
    _field_names: builtins.list[str]

    def __init__(
        self, downcast_ns_timestamp_to_us: bool = False, format_version: TableVersion = TableProperties.DEFAULT_FORMAT_VERSION
    ) -> None:  # noqa: F821
        self._field_names = []
        self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
        self._format_version = format_version

    def _field_id(self, field: pa.Field) -> int:
        """Return the Iceberg field ID stored in the Arrow field's metadata.

        Raises:
            ValueError: If the field carries no field-ID metadata.
        """
        if (field_id := _get_field_id(field)) is not None:
            return field_id
        else:
            raise ValueError(f"Cannot convert {field} to Iceberg Field as field_id is empty.")

    def schema(self, schema: pa.Schema, struct_result: StructType) -> Schema:
        return Schema(*struct_result.fields)

    def struct(self, struct: pa.StructType, field_results: builtins.list[NestedField]) -> StructType:
        return StructType(*field_results)

    def field(self, field: pa.Field, field_result: IcebergType) -> NestedField:
        """Convert an Arrow field to a NestedField; nullability maps to optional/required."""
        field_id = self._field_id(field)
        # Field doc is carried as bytes in the Arrow field metadata, if present.
        field_doc = doc_str.decode() if (field.metadata and (doc_str := field.metadata.get(PYARROW_FIELD_DOC_KEY))) else None
        field_type = field_result
        return NestedField(field_id, field.name, field_type, required=not field.nullable, doc=field_doc)

    def list(self, list_type: pa.ListType, element_result: IcebergType) -> ListType:
        element_field = list_type.value_field
        # Push/pop the element name so _field_id errors report the full path.
        self._field_names.append(LIST_ELEMENT_NAME)
        element_id = self._field_id(element_field)
        self._field_names.pop()
        return ListType(element_id, element_result, element_required=not element_field.nullable)

    def map(self, map_type: pa.MapType, key_result: IcebergType, value_result: IcebergType) -> MapType:
        key_field = map_type.key_field
        # Push/pop key and value names so _field_id errors report the full path.
        self._field_names.append(MAP_KEY_NAME)
        key_id = self._field_id(key_field)
        self._field_names.pop()
        value_field = map_type.item_field
        self._field_names.append(MAP_VALUE_NAME)
        value_id = self._field_id(value_field)
        self._field_names.pop()
        return MapType(key_id, key_result, value_id, value_result, value_required=not value_field.nullable)

    def primitive(self, primitive: pa.DataType) -> PrimitiveType:
        """Map a primitive PyArrow type to the corresponding Iceberg primitive type.

        Raises:
            TypeError: If the PyArrow type has no supported Iceberg equivalent.
            ValueError: For pa.null() when the table format version is below 3.
        """
        if pa.types.is_boolean(primitive):
            return BooleanType()
        elif pa.types.is_integer(primitive):
            width = primitive.bit_width
            if width <= 32:
                return IntegerType()
            elif width <= 64:
                return LongType()
            else:
                # Does not exist (yet)
                raise TypeError(f"Unsupported integer type: {primitive}")
        elif pa.types.is_float32(primitive):
            return FloatType()
        elif pa.types.is_float64(primitive):
            return DoubleType()
        elif isinstance(primitive, pa.Decimal128Type):
            primitive = cast(pa.Decimal128Type, primitive)
            return DecimalType(primitive.precision, primitive.scale)
        elif pa.types.is_string(primitive) or pa.types.is_large_string(primitive) or pa.types.is_string_view(primitive):
            return StringType()
        elif pa.types.is_date32(primitive):
            return DateType()
        elif isinstance(primitive, pa.Time64Type) and primitive.unit == "us":
            return TimeType()
        elif pa.types.is_timestamp(primitive):
            primitive = cast(pa.TimestampType, primitive)
            if primitive.unit in ("s", "ms", "us"):
                # Supported types, will be upcast automatically to 'us'
                pass
            elif primitive.unit == "ns":
                if self._downcast_ns_timestamp_to_us:
                    # Deliberate fall-through: after warning, the 'us' mapping
                    # below is applied.
                    logger.warning("Iceberg does not yet support 'ns' timestamp precision. Downcasting to 'us'.")
                elif self._format_version >= 3:
                    # Nanosecond precision is representable only in Iceberg V3+.
                    if primitive.tz in UTC_ALIASES:
                        return TimestamptzNanoType()
                    elif primitive.tz is None:
                        return TimestampNanoType()
                else:
                    raise TypeError(
                        "Iceberg does not yet support 'ns' timestamp precision. "
                        "Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically "
                        "downcast 'ns' to 'us' on write.",
                    )
            else:
                raise TypeError(f"Unsupported precision for timestamp type: {primitive.unit}")
            # Reached for 's'/'ms'/'us' units, for downcast 'ns', and for V3 'ns'
            # with a timezone that is neither a UTC alias nor None.
            if primitive.tz in UTC_ALIASES:
                return TimestamptzType()
            elif primitive.tz is None:
                return TimestampType()
            # A timestamp with any other timezone falls through to the final
            # "Unsupported type" error below.
        elif pa.types.is_binary(primitive) or pa.types.is_large_binary(primitive) or pa.types.is_binary_view(primitive):
            return BinaryType()
        elif pa.types.is_fixed_size_binary(primitive):
            primitive = cast(pa.FixedSizeBinaryType, primitive)
            return FixedType(primitive.byte_width)
        elif pa.types.is_null(primitive):
            # PyArrow null type (pa.null()) is converted to Iceberg UnknownType
            # UnknownType can be promoted to any primitive type in V3+ tables per the Iceberg spec
            if self._format_version < 3:
                field_path = ".".join(self._field_names) if self._field_names else "<root>"
                raise ValueError(
                    "Null type (pa.null()) is not supported in Iceberg format version "
                    f"{self._format_version}. Field: {field_path}. "
                    "Requires format-version=3+ or use a concrete type (string, int, boolean, etc.)."
                )
            return UnknownType()
        elif isinstance(primitive, pa.UuidType):
            return UUIDType()
        raise TypeError(f"Unsupported type: {primitive}")

    # The before_*/after_* hooks maintain the _field_names path stack while
    # the visitor descends into nested fields, list elements, and map entries.

    def before_field(self, field: pa.Field) -> None:
        self._field_names.append(field.name)

    def after_field(self, field: pa.Field) -> None:
        self._field_names.pop()

    def before_list_element(self, element: pa.Field) -> None:
        self._field_names.append(LIST_ELEMENT_NAME)

    def after_list_element(self, element: pa.Field) -> None:
        self._field_names.pop()

    def before_map_key(self, key: pa.Field) -> None:
        self._field_names.append(MAP_KEY_NAME)

    def after_map_key(self, element: pa.Field) -> None:
        self._field_names.pop()

    def before_map_value(self, value: pa.Field) -> None:
        self._field_names.append(MAP_VALUE_NAME)

    def after_map_value(self, element: pa.Field) -> None:
        self._field_names.pop()

@kevinjqliu kevinjqliu merged commit 4a8c84e into apache:main Mar 17, 2026
5 checks passed
@kevinjqliu
Copy link
Contributor

Thanks for the PR @iamluan and thanks for the review @nssalian

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

docs: add a table for data type conversion between arrow and iceberg types

3 participants