spark_domains#

Domains for Spark datatypes.

Data#

SparkColumnsDescriptor#

Mapping from column name to SparkColumnDescriptor.

Functions#

convert_spark_schema()

Returns mapping from column name to SparkColumnDescriptor.

convert_pandas_domain()

Returns a mapping from column name to SparkColumnDescriptor.

convert_numpy_domain()

Returns a SparkColumnDescriptor for a NumpyDomain.

convert_spark_schema(spark_schema)#

Returns mapping from column name to SparkColumnDescriptor.

Parameters

spark_schema (pyspark.sql.types.StructType) –

Return type

SparkColumnsDescriptor

convert_pandas_domain(pandas_domain)#

Returns a mapping from column name to SparkColumnDescriptor.

Parameters

pandas_domain (tmlt.core.domains.pandas_domains.PandasDataFrameDomain) –

Return type

SparkColumnsDescriptor

convert_numpy_domain(numpy_domain)#

Returns a SparkColumnDescriptor for a NumpyDomain.

Parameters

numpy_domain (tmlt.core.domains.numpy_domains.NumpyDomain) –

Return type

SparkColumnDescriptor

Classes#

SparkColumnDescriptor

Base class for describing Spark column types.

SparkIntegerColumnDescriptor

Describes an integer attribute in Spark.

SparkFloatColumnDescriptor

Describes a float attribute in Spark.

SparkStringColumnDescriptor

Describes a string attribute in Spark.

SparkDateColumnDescriptor

Describes a date attribute in Spark.

SparkTimestampColumnDescriptor

Describes a timestamp attribute in Spark.

SparkRowDomain

Domain of Spark DataFrame rows.

SparkDataFrameDomain

Domain of Spark DataFrames.

SparkGroupedDataFrameDomain

Domain of grouped DataFrames.

class SparkColumnDescriptor#

Bases: abc.ABC

Base class for describing Spark column types.

allow_null#

If True, null values are permitted in the domain.

abstract to_numpy_domain()#

Returns corresponding NumPy domain.

Return type

tmlt.core.domains.numpy_domains.NumpyDomain

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters

  • sdf (pyspark.sql.DataFrame) –

  • col_name (str) –

Return type

None

abstract valid_py_value(val)#

Returns True if val is valid for described Spark column.

Parameters

val (Any) –

Return type

bool

property data_type#

Returns data type associated with Spark column.

Return type

pyspark.sql.types.DataType

class SparkIntegerColumnDescriptor#

Bases: SparkColumnDescriptor

Describes an integer attribute in Spark.

SIZE_TO_TYPE#

Mapping from size to Spark type.

SIZE_TO_MIN_MAX#

Mapping from size to tuple of minimum and maximum value allowed.

allow_null :bool = False#

If True, null values are permitted in the domain.

size :int = 64#

Number of bits a member of the domain occupies. Must be 32 or 64.

to_numpy_domain()#

Returns corresponding NumPy domain.

Return type

tmlt.core.domains.numpy_domains.NumpyDomain

valid_py_value(val)#

Returns True if value is a valid python value for the descriptor.

Parameters

val (Any) –

Return type

bool

property data_type#

Returns data type associated with Spark column.

Return type

pyspark.sql.types.DataType

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters

  • sdf (pyspark.sql.DataFrame) –

  • col_name (str) –

Return type

None

class SparkFloatColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a float attribute in Spark.

SIZE_TO_TYPE#

Mapping from size to Spark type.

allow_nan :bool = False#

If True, NaNs are permitted in the domain.

allow_inf :bool = False#

If True, infs are permitted in the domain.

allow_null :bool = False#

If True, null values are permitted in the domain.

Note

Nulls aren’t supported in Pandas.

size :int = 64#

Number of bits a member of the domain occupies. Must be 32 or 64.

to_numpy_domain()#

Returns corresponding NumPy domain.

Return type

tmlt.core.domains.numpy_domains.NumpyDomain

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters

  • sdf (pyspark.sql.DataFrame) –

  • col_name (str) –

Return type

None

valid_py_value(val)#

Returns True if value is a valid python value for the descriptor.

In particular, this returns True only if one of the following is true:

  • val is float(“nan”) and NaN is allowed.

  • val is float(“inf”) or float(“-inf”), and inf values are allowed.

  • val is a float that can be represented in size bits.

  • val is None and nulls are allowed in the domain.

Parameters

val (Any) –

Return type

bool

property data_type#

Returns data type associated with Spark column.

Return type

pyspark.sql.types.DataType

class SparkStringColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a string attribute in Spark.

allow_null :bool = False#

If True, null values are permitted in the domain.

to_numpy_domain()#

Returns corresponding NumPy domain.

Return type

tmlt.core.domains.numpy_domains.NumpyStringDomain

valid_py_value(val)#

Returns True if value is a valid python value for the descriptor.

Parameters

val (Any) –

Return type

bool

property data_type#

Returns data type associated with Spark column.

Return type

pyspark.sql.types.DataType

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters

  • sdf (pyspark.sql.DataFrame) –

  • col_name (str) –

Return type

None

class SparkDateColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a date attribute in Spark.

allow_null :bool = False#

If True, null values are permitted in the domain.

to_numpy_domain()#

Returns corresponding NumPy domain.

Note

Date types are not supported in NumPy; this method always raises an exception.

Return type

tmlt.core.domains.numpy_domains.NumpyDomain

valid_py_value(val)#

Returns True if the value is a valid Python value for the descriptor.

Parameters

val (Any) –

Return type

bool

property data_type#

Returns data type associated with Spark column.

Return type

pyspark.sql.types.DataType

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters

  • sdf (pyspark.sql.DataFrame) –

  • col_name (str) –

Return type

None

class SparkTimestampColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a timestamp attribute in Spark.

allow_null :bool = False#

If True, null values are permitted in the domain.

to_numpy_domain()#

Returns corresponding NumPy domain.

Note

Timestamp types are not supported in NumPy; this method always raises an exception.

Return type

tmlt.core.domains.numpy_domains.NumpyDomain

valid_py_value(val)#

Returns True if the value is a valid Python value for the descriptor.

Parameters

val (Any) –

Return type

bool

property data_type#

Returns data type associated with Spark column.

Return type

pyspark.sql.types.DataType

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters

  • sdf (pyspark.sql.DataFrame) –

  • col_name (str) –

Return type

None

class SparkRowDomain(schema)#

Bases: tmlt.core.domains.base.Domain

Domain of Spark DataFrame rows.

Parameters

schema (SparkColumnsDescriptor) –

__init__(schema)#

Constructor.

Parameters

schema (Mapping[str, SparkColumnDescriptor]) – Mapping from column names to column descriptors.

property schema#

Returns mapping from column names to column descriptors.

Return type

SparkColumnsDescriptor

abstract validate(value)#

Raises error if value is not a row with matching schema.

Parameters

value (Any) –

Return type

None

abstract __contains__(value)#

Returns True if value is a row with matching schema.

Parameters

value (Any) –

Return type

bool

__eq__(other)#

Return True if the classes are equivalent.

Parameters

other (Any) –

Return type

bool

property carrier_type#

Returns carrier types for members of SparkRowDomain.

Return type

type

class SparkDataFrameDomain(schema)#

Bases: tmlt.core.domains.base.Domain

Domain of Spark DataFrames.

Parameters

schema (SparkColumnsDescriptor) –

__init__(schema)#

Constructor.

Parameters

schema (Mapping[str, SparkColumnDescriptor]) – Mapping from column names to column descriptors.

property schema#

Returns mapping from column names to column descriptors.

Return type

SparkColumnsDescriptor

validate(value)#

Raises error if value is not a DataFrame with matching schema.

Parameters

value (Any) –

Return type

None

__eq__(other)#

Return True if the classes are equivalent.

Parameters

other (Any) –

Return type

bool

property carrier_type#

Returns carrier type for the domain.

Return type

type

__getitem__(col_name)#

Returns column descriptor for given column.

Parameters

col_name (str) –

Return type

SparkColumnDescriptor

classmethod from_spark_schema(schema)#

Returns a SparkDataFrameDomain constructed from a Spark schema.

Note

If schema contains float types, nans and infs are allowed since the schema places no restrictions on these.

Parameters

schema (pyspark.sql.types.StructType) – Spark schema for constructing domain.

Return type

SparkDataFrameDomain

property spark_schema#

Returns Spark schema object according to domain.

Note

There isn’t a one-to-one correspondence between Spark schema objects and SparkDataFrameDomain objects since the domains encode additional information about allowing nans or infs in float columns. Other information may get added in the future and these cannot be represented with the Spark schema (StructType) object.

Return type

pyspark.sql.types.StructType

project(cols)#

Project this domain to a subset of columns.

The column ordering of the schema is used if it differs from the input ordering.

Parameters

cols (Sequence[str]) –

Return type

SparkDataFrameDomain

__contains__(value)#

Returns True if value is in the domain.

Parameters

value (Any) –

Return type

bool

class SparkGroupedDataFrameDomain(schema, groupby_columns)#

Bases: tmlt.core.domains.base.Domain

Domain of grouped DataFrames.

Parameters
  • schema (SparkColumnsDescriptor) –

  • groupby_columns (Sequence[str]) –

__init__(schema, groupby_columns)#

Constructor.

Parameters

  • schema (SparkColumnsDescriptor) – Mapping from column names to column descriptors.

  • groupby_columns (Sequence[str]) – Columns to group by.

property schema#

Returns mapping from column names to column descriptors.

Return type

SparkColumnsDescriptor

property groupby_columns#

Returns list of columns used for grouping.

Return type

List[str]

property carrier_type#

Returns carrier type for the domain.

Return type

type

property spark_schema#

Returns Spark schema object according to domain.

Note

There isn’t a one-to-one correspondence between Spark schema objects and SparkDataFrameDomain objects since the domains encode additional information about allowing nans or infs in float columns. Other information may get added in the future and these cannot be represented with the Spark schema (StructType) object.

Return type

pyspark.sql.types.StructType

validate(value)#

Raises error if value is not a GroupedDataFrame with matching group_keys.

Parameters

value (Any) –

Return type

None

get_group_domain()#

Return the domain for one of the groups.

Return type

SparkDataFrameDomain

__eq__(other)#

Return True if the schemas and group keys are identical.

Parameters

other (Any) –

Return type

bool

__getitem__(col_name)#

Returns column descriptor for given column.

Parameters

col_name (str) –

Return type

SparkColumnDescriptor

__contains__(value)#

Returns True if value is in the domain.

Parameters

value (Any) –

Return type

bool