spark_domains#

Domains for Spark datatypes.

Data#

SparkColumnsDescriptor#

Mapping from column name to SparkColumnDescriptor.
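
A SparkColumnsDescriptor is an ordinary mapping, so one can be written as a dict literal. A minimal illustrative sketch using descriptors from this module:

>>> from tmlt.core.domains.spark_domains import (
...     SparkIntegerColumnDescriptor,
...     SparkStringColumnDescriptor,
... )
>>> columns = {
...     "id": SparkIntegerColumnDescriptor(),
...     "name": SparkStringColumnDescriptor(allow_null=True),
... }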

Functions#

convert_spark_schema()

Returns a mapping from column name to SparkColumnDescriptor.

convert_pandas_domain()

Returns a mapping from column name to SparkColumnDescriptor.

convert_numpy_domain()

Returns a SparkColumnDescriptor for a NumpyDomain.

convert_spark_schema(spark_schema)#

Returns a mapping from column name to SparkColumnDescriptor.

Parameters:

spark_schema (pyspark.sql.types.StructType)

Return type:

SparkColumnsDescriptor
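
A minimal usage sketch; the exact descriptors produced depend on the types and nullable flags in the schema:

>>> from pyspark.sql.types import LongType, StringType, StructField, StructType
>>> from tmlt.core.domains.spark_domains import convert_spark_schema
>>> spark_schema = StructType([
...     StructField("id", LongType(), nullable=False),
...     StructField("name", StringType(), nullable=True),
... ])
>>> descriptors = convert_spark_schema(spark_schema)
>>> sorted(descriptors)
['id', 'name']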

convert_pandas_domain(pandas_domain)#

Returns a mapping from column name to SparkColumnDescriptor.

Parameters:

pandas_domain (tmlt.core.domains.pandas_domains.PandasDataFrameDomain)

Return type:

SparkColumnsDescriptor

convert_numpy_domain(numpy_domain)#

Returns a SparkColumnDescriptor for a NumpyDomain.

Parameters:

numpy_domain (tmlt.core.domains.numpy_domains.NumpyDomain)

Return type:

SparkColumnDescriptor
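
A minimal sketch, assuming NumpyIntegerDomain from tmlt.core.domains.numpy_domains:

>>> from tmlt.core.domains.numpy_domains import NumpyIntegerDomain
>>> from tmlt.core.domains.spark_domains import convert_numpy_domain
>>> desc = convert_numpy_domain(NumpyIntegerDomain())
>>> type(desc).__name__
'SparkIntegerColumnDescriptor'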

Classes#

SparkColumnDescriptor

Base class for describing Spark column types.

SparkIntegerColumnDescriptor

Describes an integer attribute in Spark.

SparkFloatColumnDescriptor

Describes a float attribute in Spark.

SparkStringColumnDescriptor

Describes a string attribute in Spark.

SparkDateColumnDescriptor

Describes a date attribute in Spark.

SparkTimestampColumnDescriptor

Describes a timestamp attribute in Spark.

SparkRowDomain

Domain of Spark DataFrame rows.

SparkDataFrameDomain

Domain of Spark DataFrames.

SparkGroupedDataFrameDomain

Domain of grouped DataFrames.

class SparkColumnDescriptor#

Bases: abc.ABC

Base class for describing Spark column types.

allow_null#

If True, null values are permitted in the domain.

abstract property data_type: pyspark.sql.types.DataType#

Returns data type associated with Spark column.

Return type:

pyspark.sql.types.DataType

abstract to_numpy_domain()#

Returns corresponding NumPy domain.

Return type:

tmlt.core.domains.numpy_domains.NumpyDomain

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters:

  • sdf (pyspark.sql.DataFrame)

  • col_name (str)

Return type:

None

abstract valid_py_value(val)#

Returns True if val is valid for the described Spark column.

Parameters:

val (Any)

Return type:

bool

class SparkIntegerColumnDescriptor#

Bases: SparkColumnDescriptor

Describes an integer attribute in Spark.

SIZE_TO_TYPE#

Mapping from size to Spark type.

SIZE_TO_MIN_MAX#

Mapping from size to tuple of minimum and maximum value allowed.

allow_null: bool = False#

If True, null values are permitted in the domain.

size: int = 64#

Number of bits a member of the domain occupies. Must be 32 or 64.
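
An illustrative sketch of how size interacts with valid_py_value (the bounds come from SIZE_TO_MIN_MAX, and None is rejected because allow_null defaults to False):

>>> from tmlt.core.domains.spark_domains import SparkIntegerColumnDescriptor
>>> desc = SparkIntegerColumnDescriptor(size=32)
>>> desc.valid_py_value(2**31 - 1)
True
>>> desc.valid_py_value(2**31)
False
>>> desc.valid_py_value(None)
False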

property data_type: pyspark.sql.types.DataType#

Returns data type associated with Spark column.

Return type:

pyspark.sql.types.DataType

to_numpy_domain()#

Returns corresponding NumPy domain.

Return type:

tmlt.core.domains.numpy_domains.NumpyDomain

valid_py_value(val)#

Returns True if value is a valid Python value for the descriptor.

Parameters:

val (Any)

Return type:

bool

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters:

  • sdf (pyspark.sql.DataFrame)

  • col_name (str)

Return type:

None

class SparkFloatColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a float attribute in Spark.

SIZE_TO_TYPE#

Mapping from size to Spark type.

allow_nan: bool = False#

If True, NaNs are permitted in the domain.

allow_inf: bool = False#

If True, infs are permitted in the domain.

allow_null: bool = False#

If True, null values are permitted in the domain.

Note

Null values are not supported in pandas.

size: int = 64#

Number of bits a member of the domain occupies. Must be 32 or 64.

property data_type: pyspark.sql.types.DataType#

Returns data type associated with Spark column.

Return type:

pyspark.sql.types.DataType

to_numpy_domain()#

Returns corresponding NumPy domain.

Return type:

tmlt.core.domains.numpy_domains.NumpyDomain

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters:

  • sdf (pyspark.sql.DataFrame)

  • col_name (str)

Return type:

None

valid_py_value(val)#

Returns True if value is a valid Python value for the descriptor.

In particular, this returns True only if one of the following is true:

  • val is float("nan") and NaN is allowed.

  • val is float("inf") or float("-inf"), and inf values are allowed.

  • val is a float that can be represented in size bits.

  • val is None and nulls are allowed in the domain.

Parameters:

val (Any)

Return type:

bool
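
An illustrative sketch of the rules above, assuming the default allow_null=False:

>>> from tmlt.core.domains.spark_domains import SparkFloatColumnDescriptor
>>> desc = SparkFloatColumnDescriptor(allow_nan=True, allow_inf=False)
>>> desc.valid_py_value(1.5)
True
>>> desc.valid_py_value(float("nan"))
True
>>> desc.valid_py_value(float("inf"))
False
>>> desc.valid_py_value(None)
False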

class SparkStringColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a string attribute in Spark.

allow_null: bool = False#

If True, null values are permitted in the domain.

property data_type: pyspark.sql.types.DataType#

Returns data type associated with Spark column.

Return type:

pyspark.sql.types.DataType

to_numpy_domain()#

Returns corresponding NumPy domain.

Return type:

tmlt.core.domains.numpy_domains.NumpyStringDomain

valid_py_value(val)#

Returns True if value is a valid Python value for the descriptor.

Parameters:

val (Any)

Return type:

bool

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters:

  • sdf (pyspark.sql.DataFrame)

  • col_name (str)

Return type:

None

class SparkDateColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a date attribute in Spark.

allow_null: bool = False#

If True, null values are permitted in the domain.

property data_type: pyspark.sql.types.DataType#

Returns data type associated with Spark column.

Return type:

pyspark.sql.types.DataType

to_numpy_domain()#

Returns corresponding NumPy domain.

Note

Date types are not supported in NumPy; this method always raises an exception.

Return type:

tmlt.core.domains.numpy_domains.NumpyDomain

valid_py_value(val)#

Returns True if the value is a valid Python value for the descriptor.

Parameters:

val (Any)

Return type:

bool

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters:

  • sdf (pyspark.sql.DataFrame)

  • col_name (str)

Return type:

None

class SparkTimestampColumnDescriptor#

Bases: SparkColumnDescriptor

Describes a timestamp attribute in Spark.

allow_null: bool = False#

If True, null values are permitted in the domain.

property data_type: pyspark.sql.types.DataType#

Returns data type associated with Spark column.

Return type:

pyspark.sql.types.DataType

to_numpy_domain()#

Returns corresponding NumPy domain.

Note

Timestamp types are not supported in NumPy; this method always raises an exception.

Return type:

tmlt.core.domains.numpy_domains.NumpyDomain

valid_py_value(val)#

Returns True if the value is a valid Python value for the descriptor.

Parameters:

val (Any)

Return type:

bool

validate_column(sdf, col_name)#

Raises error if not all values in given DataFrame column match descriptor.

Parameters:

  • sdf (pyspark.sql.DataFrame)

  • col_name (str)

Return type:

None

class SparkRowDomain(schema)#

Bases: tmlt.core.domains.base.Domain

Domain of Spark DataFrame rows.

Parameters:

schema (SparkColumnsDescriptor)

property schema: SparkColumnsDescriptor#

Returns mapping from column names to column descriptors.

Return type:

SparkColumnsDescriptor

property carrier_type: type#

Returns the carrier type for members of SparkRowDomain.

Return type:

type

__init__(schema)#

Constructor.

Parameters:

schema (Mapping[str, SparkColumnDescriptor]) – Mapping from column names to column descriptors.

abstract validate(value)#

Raises error if value is not a row with matching schema.

Parameters:

value (Any)

Return type:

None

abstract __contains__(value)#

Returns True if value is a row with matching schema.

Parameters:

value (Any)

Return type:

bool

__eq__(other)#

Returns True if the two domains are equivalent.

Parameters:

other (Any)

Return type:

bool

class SparkDataFrameDomain(schema)#

Bases: tmlt.core.domains.base.Domain

Domain of Spark DataFrames.

Parameters:

schema (SparkColumnsDescriptor)

property schema: SparkColumnsDescriptor#

Returns mapping from column names to column descriptors.

Return type:

SparkColumnsDescriptor

property carrier_type: type#

Returns carrier type for the domain.

Return type:

type

property spark_schema: pyspark.sql.types.StructType#

Returns the Spark schema object corresponding to the domain.

Note

There isn’t a one-to-one correspondence between Spark schema objects and SparkDataFrameDomain objects: the domains encode additional information, such as whether NaNs or infs are allowed in float columns, that cannot be represented in a Spark schema (StructType) object, and more such information may be added in the future.

Return type:

pyspark.sql.types.StructType

__init__(schema)#

Constructor.

Parameters:

schema (Mapping[str, SparkColumnDescriptor]) – Mapping from column names to column descriptors.

validate(value)#

Raises error if value is not a DataFrame with matching schema.

Parameters:

value (Any)

Return type:

None
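
A minimal end-to-end sketch (assumes a running SparkSession; column names are illustrative):

>>> from pyspark.sql import SparkSession
>>> from tmlt.core.domains.spark_domains import (
...     SparkDataFrameDomain,
...     SparkIntegerColumnDescriptor,
...     SparkStringColumnDescriptor,
... )
>>> spark = SparkSession.builder.getOrCreate()
>>> domain = SparkDataFrameDomain({
...     "id": SparkIntegerColumnDescriptor(allow_null=True),
...     "name": SparkStringColumnDescriptor(allow_null=True),
... })
>>> sdf = spark.createDataFrame([(1, "a"), (2, None)], schema=["id", "name"])
>>> domain.validate(sdf)  # raises on mismatch; returns None on success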

__eq__(other)#

Returns True if the two domains are equivalent.

Parameters:

other (Any)

Return type:

bool

__getitem__(col_name)#

Returns column descriptor for given column.

Parameters:

col_name (str)

Return type:

SparkColumnDescriptor

classmethod from_spark_schema(schema)#

Returns a SparkDataFrameDomain constructed from a Spark schema.

Note

If schema contains float types, nans and infs are allowed since the schema places no restrictions on these.

Parameters:

schema (pyspark.sql.types.StructType) – Spark schema for constructing domain.

Return type:

SparkDataFrameDomain
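
A sketch showing the note above in action; per the note, the resulting float descriptor permits NaNs:

>>> from pyspark.sql.types import DoubleType, StructField, StructType
>>> from tmlt.core.domains.spark_domains import SparkDataFrameDomain
>>> spark_schema = StructType([StructField("score", DoubleType(), nullable=True)])
>>> domain = SparkDataFrameDomain.from_spark_schema(spark_schema)
>>> domain["score"].allow_nan
True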

project(cols)#

Project this domain to a subset of columns.

If the ordering of cols differs from the schema’s column ordering, the schema’s ordering is used.

Parameters:

cols (Sequence[str])

Return type:

SparkDataFrameDomain
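
A sketch of projecting a two-column domain down to a single column:

>>> from tmlt.core.domains.spark_domains import (
...     SparkDataFrameDomain,
...     SparkIntegerColumnDescriptor,
...     SparkStringColumnDescriptor,
... )
>>> domain = SparkDataFrameDomain({
...     "id": SparkIntegerColumnDescriptor(),
...     "name": SparkStringColumnDescriptor(),
... })
>>> list(domain.project(["id"]).schema)
['id']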

__contains__(value)#

Returns True if value is in the domain.

Parameters:

value (Any)

Return type:

bool

class SparkGroupedDataFrameDomain(schema, groupby_columns)#

Bases: tmlt.core.domains.base.Domain

Domain of grouped DataFrames.

Parameters:
  • schema (SparkColumnsDescriptor)

  • groupby_columns (Sequence[str])

property schema: SparkColumnsDescriptor#

Returns mapping from column names to column descriptors.

Return type:

SparkColumnsDescriptor

property groupby_columns: List[str]#

Returns list of columns used for grouping.

Return type:

List[str]

property carrier_type: type#

Returns carrier type for the domain.

Return type:

type

property spark_schema: pyspark.sql.types.StructType#

Returns the Spark schema object corresponding to the domain.

Note

There isn’t a one-to-one correspondence between Spark schema objects and SparkDataFrameDomain objects: the domains encode additional information, such as whether NaNs or infs are allowed in float columns, that cannot be represented in a Spark schema (StructType) object, and more such information may be added in the future.

Return type:

pyspark.sql.types.StructType

__init__(schema, groupby_columns)#

Constructor.

Parameters:

  • schema (Mapping[str, SparkColumnDescriptor]) – Mapping from column names to column descriptors.

  • groupby_columns (Sequence[str]) – Names of the columns to group by.

validate(value)#

Raises error if value is not a GroupedDataFrame with matching group_keys.

Parameters:

value (Any)

Return type:

None

get_group_domain()#

Return the domain for one of the groups.

Return type:

SparkDataFrameDomain
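
A sketch of constructing a grouped-DataFrame domain and inspecting it (column names are illustrative; the exact constructor arguments follow the class signature above):

>>> from tmlt.core.domains.spark_domains import (
...     SparkGroupedDataFrameDomain,
...     SparkIntegerColumnDescriptor,
...     SparkStringColumnDescriptor,
... )
>>> domain = SparkGroupedDataFrameDomain(
...     schema={
...         "group": SparkStringColumnDescriptor(),
...         "count": SparkIntegerColumnDescriptor(),
...     },
...     groupby_columns=["group"],
... )
>>> domain.groupby_columns
['group']
>>> group_domain = domain.get_group_domain()  # a SparkDataFrameDomain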

__eq__(other)#

Returns True if the schemas and group keys are identical.

Parameters:

other (Any)

Return type:

bool

__getitem__(col_name)#

Returns column descriptor for given column.

Parameters:

col_name (str)

Return type:

SparkColumnDescriptor

__contains__(value)#

Returns True if value is in the domain.

Parameters:

value (Any)

Return type:

bool