typedschema API#

class typedschema.Column[source]#

Bases: str

A column in a named schema. It is also a string, so it behaves like a string.

You can, for example, pass it to F.col().

import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from typedschema import Column

name = Column(StringType(), nullable=True)

F.col(name) # works like a string
name.col    # also works
name.c      # alias for name.col -> for the lazy ones

Common PySpark column operations, such as cast() and alias(), as well as the dtype, are available directly on the column.
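
A small sketch of these shortcuts, building on the example above; df is an assumed DataFrame that contains such a column:

df.select(name.cast("string"))   # shortcut for F.col(name).cast("string")
df.select(name.alias("label"))   # shortcut for F.col(name).alias("label")
name.dtype                       # the declared DataType, here StringType()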

Parameters:
  • dtype – the DataType

  • nullable – is it nullable?

  • meta – meta information

  • name – usually not needed, only for name clashes. See Schema for more info

__init__(dtype, nullable=False, meta=None, name=None)[source]#
static __new__(cls, dtype, nullable=False, meta=None, name=None)[source]#
alias(name)[source]#

Alias this column.

Shortcut for F.col().alias()

Parameters:

name (Self | str)

Return type:

Column

property c: Column#

Alias for col

cast(dtype)[source]#

Cast this column to a different data type.

Shortcut for F.col().cast()

Parameters:

dtype (str | DataType)

Return type:

Column

property col: Column#

Convert the column to a PySpark Column

property dtype: DataType#

The data type of the column

field: StructField#
classmethod from_structfield(field)[source]#
Parameters:

field (StructField)

property name: str#

The name of the column

class typedschema.FQTN[source]#

Bases: str

Short for Fully Qualified Table Name. Simplifies table name handling.

Parameters:
  • ns – The namespace aka schema aka keyspace (and sometimes aka database).

  • name – The table name

__init__(ns, name)[source]#
static __new__(cls, ns, name)[source]#
property namespace: str#
classmethod of(table_name)[source]#
Parameters:

table_name (str | Self)
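
A short usage sketch; the exact string value of an FQTN (e.g. whether it renders as "ns.name") and what of() parses are assumptions based on it being a str subclass:

from typedschema import FQTN

t = FQTN("sales", "customers")
t.namespace        # 'sales'
t2 = FQTN.of(t)    # of() also accepts an already qualified name as a plain string
spark.table(t)     # spark is an assumed SparkSession; being a str, an FQTN can be
                   # passed wherever a table name is expected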

class typedschema.Schema[source]#

Bases: object

A typed schema is a schema definition that has the field/column names as named attributes of the class definition.

Parameters:
  • case_sensitive – are the columns/fields case sensitive?

  • meta – key-value entries related to the schema, for example the table name. Because the table name is needed quite frequently, the self.table_name property is a shortcut for meta["name"].

Examples:

>>> from pyspark.sql.types import StructField, StringType
>>> from typedschema import Schema, Column
>>>
>>> class ExampleSchema(Schema):
...     # syntax is Column(DATA_TYPE, IS_NULLABLE)
...     name = Column(StringType(), True)
...     city = Column(StringType(), True)
...     street = Column(StringType(), True)
...     # a name clash: the column is named "cols" but "cols" is also a reserved field
...     # (see `typedschema.RESERVED_FIELDS` for list)
...     # you can use a "_" as a workaround for the field and supply the name as arg to Column
...     cols_ = Column(StringType(), True, name="cols")
...     # meta is a dict. You can dump whatever you think is useful
...     # (you can also skip it, of course)
...     meta = {"default_values": {"name": "NA"}, "name": "customers"}
...     # the schema fields are considered case-insensitive
...     # in all functions, such as equality tests
...     # (case_sensitive is False by default)
...     case_sensitive = False
>>>
>>> # we have to create an object to use the full functionality
>>> # (for e.g. testing schema equality with `==`)
>>> exs = ExampleSchema()
>>> exs.table_name
'customers'
>>> exs.cols
['name', 'city', 'street', 'cols']
>>> [c.field for c in exs.cols]
[
StructField('name', StringType(), True),
StructField('city', StringType(), True),
StructField('street', StringType(), True),
StructField('cols', StringType(), True)
]
>>> exs.city.field
StructField('city', StringType(), True)
>>> exs.city
'city'
__init__(*, case_sensitive=None, meta=None)[source]#
case_sensitive: bool = False#
cols = []#
contains(other)[source]#
Parameters:

other (StructField | str)

property dtypes: list[tuple[str, str]]#

Data types of the columns

Returns:

a list of (column name, data type) tuples
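
For the ExampleSchema above this might look as follows (assuming the same string representation that PySpark uses for DataFrame.dtypes):

exs.dtypes
# [('name', 'string'), ('city', 'string'), ('street', 'string'), ('cols', 'string')]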

property fields: list[StructField]#

The list of columns/fields.

get_field(name)[source]#
Return type:

StructField

isequal(other, strict_null=True)[source]#
Parameters:

other (Sequence[StructField] | Self | StructType | DataFrame)

issubset(other, strict_null=True)[source]#
Parameters:

other (Sequence[StructField] | Self | StructType | DataFrame)

issuperset(other, strict_null=True)[source]#
Parameters:

other (Sequence[StructField] | Self | StructType | DataFrame)
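
A sketch of validating a DataFrame against the schema; df is an assumed DataFrame, and the reading of the subset/superset direction and of strict_null (whether nullability must match exactly) is an assumption based on the names:

df = spark.table(exs.table_name)       # spark is an assumed SparkSession

exs.isequal(df)                        # columns and types match exactly
exs.issubset(df)                       # every schema column also exists in df
exs.issuperset(df, strict_null=False)  # df has no columns beyond the schema; nullability is ignored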

meta: dict[Hashable, Any] = None#
property spark_schema: StructType#

The Spark schema

A StructType representation of the schema that can be used to construct DataFrames.
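
A minimal sketch of constructing a DataFrame from the ExampleSchema above; spark is an assumed SparkSession:

rows = [("Alice", "Berlin", "Main St", "x")]
df = spark.createDataFrame(rows, schema=exs.spark_schema)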

property table_name#

Returns meta["name"].

typedschema.diff_schemas(a, b)[source]#

Diff two schemas

Returns:

a list of tuples; each tuple has the structure (diffType, colname of a, colname of b). diffType can be one of:

  • + – a is missing this column, b has it extra

  • - – a has this column extra, it is missing in b

  • (space) – no difference

  • > – the column is present in a and b, but the data type differs

  • ! – the column is present in a and b, but the nullable constraint differs
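
A sketch of diffing two hypothetical schemas; whether diff_schemas accepts Schema instances directly, and how a missing column is represented inside a tuple, are assumptions:

from pyspark.sql.types import IntegerType, StringType
from typedschema import Column, Schema, diff_schemas

class OldSchema(Schema):
    name = Column(StringType(), True)
    city = Column(StringType(), True)

class NewSchema(Schema):
    name = Column(IntegerType(), True)
    street = Column(StringType(), True)

for diff_type, col_a, col_b in diff_schemas(OldSchema(), NewSchema()):
    print(diff_type, col_a, col_b)
# expected kinds of entries:
#   '>' for name   (present in both, data type differs)
#   '-' for city   (only in a)
#   '+' for street (only in b)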

typedschema.generate_schema_def(s, name='UnnamedSchema')[source]#

Generate Python code for a Schema from a Spark schema, a sequence of StructFields, or another Schema

Returns:

Python code for the Schema that can be copy/pasted into your project.
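
A sketch of the intended workflow, assuming an existing DataFrame df whose schema you want to capture; passing its StructType (df.schema) is an assumption about what the Spark input above refers to:

from typedschema import generate_schema_def

code = generate_schema_def(df.schema, name="CustomerSchema")
print(code)  # copy/paste the printed class definition into your project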