data_structures Module API

This section documents the core data structures used throughout datashard.

Enums

FileFormat

class datashard.FileFormat(value)[source]

Bases: Enum

Supported file formats

PARQUET = 'parquet'
AVRO = 'avro'
ORC = 'orc'

ManifestContent

Classes

Schema

class datashard.Schema(schema_id: int, fields: List[Dict[str, Any]], schema_string: str = '')[source]

Bases: object

Table schema definition

schema_id: int
fields: List[Dict[str, Any]]
schema_string: str = ''
__init__(schema_id: int, fields: List[Dict[str, Any]], schema_string: str = '') -> None

PartitionField

PartitionSpec

class datashard.PartitionSpec(spec_id: int, fields: List[PartitionField])[source]

Bases: object

Partition specification

spec_id: int
fields: List[PartitionField]
__init__(spec_id: int, fields: List[PartitionField]) -> None

SortField

SortOrder

class datashard.SortOrder(order_id: int, fields: List[SortField])[source]

Bases: object

Sort order specification

order_id: int
fields: List[SortField]
__init__(order_id: int, fields: List[SortField]) -> None

DataFile

class datashard.DataFile(file_path: str, file_format: FileFormat, partition_values: Dict[str, Any], record_count: int, file_size_in_bytes: int, column_sizes: Dict[int, int] | None = None, value_counts: Dict[int, int] | None = None, null_value_counts: Dict[int, int] | None = None, lower_bounds: Dict[int, Any] | None = None, upper_bounds: Dict[int, Any] | None = None, key_metadata: bytes | None = None, checksum: str | None = None, split_offsets: List[int] | None = None, split_compressed_offsets: List[int] | None = None, equality_ids: List[int] | None = None, sort_order_id: int | None = None)[source]

Bases: object

Represents a data file in the table

file_path: str
file_format: FileFormat
partition_values: Dict[str, Any]
record_count: int
file_size_in_bytes: int
column_sizes: Dict[int, int] | None = None
value_counts: Dict[int, int] | None = None
null_value_counts: Dict[int, int] | None = None
lower_bounds: Dict[int, Any] | None = None
upper_bounds: Dict[int, Any] | None = None
key_metadata: bytes | None = None
checksum: str | None = None
split_offsets: List[int] | None = None
split_compressed_offsets: List[int] | None = None
equality_ids: List[int] | None = None
sort_order_id: int | None = None
__init__(file_path: str, file_format: FileFormat, partition_values: Dict[str, Any], record_count: int, file_size_in_bytes: int, column_sizes: Dict[int, int] | None = None, value_counts: Dict[int, int] | None = None, null_value_counts: Dict[int, int] | None = None, lower_bounds: Dict[int, Any] | None = None, upper_bounds: Dict[int, Any] | None = None, key_metadata: bytes | None = None, checksum: str | None = None, split_offsets: List[int] | None = None, split_compressed_offsets: List[int] | None = None, equality_ids: List[int] | None = None, sort_order_id: int | None = None) -> None

DeleteFile

class datashard.DeleteFile(file_path: str, file_format: FileFormat, partition_values: Dict[str, Any], record_count: int, file_size_in_bytes: int, content: ManifestContent = ManifestContent.DELETES)[source]

Bases: object

Represents a delete file in the table

file_path: str
file_format: FileFormat
partition_values: Dict[str, Any]
record_count: int
file_size_in_bytes: int
content: ManifestContent = ManifestContent.DELETES
__init__(file_path: str, file_format: FileFormat, partition_values: Dict[str, Any], record_count: int, file_size_in_bytes: int, content: ManifestContent = ManifestContent.DELETES) -> None

ManifestFile

class datashard.ManifestFile(manifest_path: str, manifest_length: int, partition_spec_id: int, added_snapshot_id: int, added_data_files_count: int, existing_data_files_count: int, deleted_data_files_count: int, partitions: List[Dict[str, Any]], content: ManifestContent = ManifestContent.DATA, sequence_number: int | None = None, min_sequence_number: int | None = None)[source]

Bases: object

Manifest file that lists data or delete files

manifest_path: str
manifest_length: int
partition_spec_id: int
added_snapshot_id: int
added_data_files_count: int
existing_data_files_count: int
deleted_data_files_count: int
partitions: List[Dict[str, Any]]
content: ManifestContent = ManifestContent.DATA
sequence_number: int | None = None
min_sequence_number: int | None = None
__init__(manifest_path: str, manifest_length: int, partition_spec_id: int, added_snapshot_id: int, added_data_files_count: int, existing_data_files_count: int, deleted_data_files_count: int, partitions: List[Dict[str, Any]], content: ManifestContent = ManifestContent.DATA, sequence_number: int | None = None, min_sequence_number: int | None = None) -> None

Snapshot

class datashard.Snapshot(snapshot_id: int, timestamp_ms: int, manifest_list: str, parent_snapshot_id: int | None = None, operation: str | None = None, summary: Dict[str, str] | None = None, schema_id: int | None = None)[source]

Bases: object

Represents a snapshot of the table at a point in time

snapshot_id: int
timestamp_ms: int
manifest_list: str
parent_snapshot_id: int | None = None
operation: str | None = None
summary: Dict[str, str] | None = None
schema_id: int | None = None
__init__(snapshot_id: int, timestamp_ms: int, manifest_list: str, parent_snapshot_id: int | None = None, operation: str | None = None, summary: Dict[str, str] | None = None, schema_id: int | None = None) -> None

HistoryEntry

TableMetadata

class datashard.TableMetadata(location: str, table_uuid: str = <factory>, format_version: int = 2, last_sequence_number: int = 0, last_updated_ms: int = <factory>, last_column_id: int = 0, schemas: List[Schema] = <factory>, current_schema_id: int = 0, partition_specs: List[PartitionSpec] = <factory>, default_spec_id: int = 0, sort_orders: List[SortOrder] = <factory>, default_sort_order_id: int = 1, properties: Dict[str, str] = <factory>, current_snapshot_id: int | None = None, snapshots: List[Snapshot] = <factory>, snapshot_log: List[HistoryEntry] = <factory>, metadata_log: List[Dict[str, Any]] = <factory>)[source]

Bases: object

Main metadata structure for an Iceberg table

location: str
table_uuid: str
format_version: int = 2
last_sequence_number: int = 0
last_updated_ms: int
last_column_id: int = 0
schemas: List[Schema]
current_schema_id: int = 0
partition_specs: List[PartitionSpec]
default_spec_id: int = 0
sort_orders: List[SortOrder]
default_sort_order_id: int = 1
properties: Dict[str, str]
current_snapshot_id: int | None = None
snapshots: List[Snapshot]
snapshot_log: List[HistoryEntry]
metadata_log: List[Dict[str, Any]]
__init__(location: str, table_uuid: str = <factory>, format_version: int = 2, last_sequence_number: int = 0, last_updated_ms: int = <factory>, last_column_id: int = 0, schemas: List[Schema] = <factory>, current_schema_id: int = 0, partition_specs: List[PartitionSpec] = <factory>, default_spec_id: int = 0, sort_orders: List[SortOrder] = <factory>, default_sort_order_id: int = 1, properties: Dict[str, str] = <factory>, current_snapshot_id: int | None = None, snapshots: List[Snapshot] = <factory>, snapshot_log: List[HistoryEntry] = <factory>, metadata_log: List[Dict[str, Any]] = <factory>) -> None