Python API Reference
Document the Python modules and functions for programmatic token generation.
Core Modules
from opentoken.attributes.person.birth_date_attribute import BirthDateAttribute
from opentoken.attributes.person.first_name_attribute import FirstNameAttribute
from opentoken.attributes.person.last_name_attribute import LastNameAttribute
from opentoken.attributes.person.postal_code_attribute import PostalCodeAttribute
from opentoken.attributes.person.sex_attribute import SexAttribute
from opentoken.attributes.person.social_security_number_attribute import SocialSecurityNumberAttribute
from opentoken.tokens.token_definition import TokenDefinition
from opentoken.tokens.token_generator import TokenGenerator
from opentoken.tokens.tokenizer.sha256_tokenizer import SHA256Tokenizer
from opentoken.tokentransformer.encrypt_token_transformer import EncryptTokenTransformer
from opentoken.tokentransformer.hash_token_transformer import HashTokenTransformer
Person Attribute Dict
OpenToken’s Python library represents a person’s values as a dict keyed by attribute class:
person_attributes = {
    FirstNameAttribute: "John",
    LastNameAttribute: "Doe",
    BirthDateAttribute: "1980-01-15",
    SexAttribute: "Male",
    PostalCodeAttribute: "98004",
    SocialSecurityNumberAttribute: "123-45-6789",
}
Normalization and validation are handled internally by TokenGenerator using the attribute implementations loaded via AttributeLoader.
TokenDefinition
TokenDefinition encapsulates the built-in T1–T5 rule definitions.
token_definition = TokenDefinition()
TokenGenerator
TokenGenerator validates/normalizes inputs and produces token signatures and tokens.
Methods
| Method | Return Type | Description |
|---|---|---|
| `get_all_token_signatures(person_attributes)` | `Dict[str, str]` | Generates token signatures for all rules (useful for debugging/logging) |
| `get_all_tokens(person_attributes)` | `TokenGeneratorResult` | Generates tokens for all rules and captures invalid/blank attribute info |
| `get_invalid_person_attributes(person_attributes)` | `Set[str]` | Returns the names of any provided attributes whose values fail validation |
Example
tokenizer = SHA256Tokenizer([
    HashTokenTransformer("HashingSecret"),
    EncryptTokenTransformer("Secret-Encryption-Key-Goes-Here."),
])
generator = TokenGenerator(TokenDefinition(), tokenizer)

invalid = generator.get_invalid_person_attributes(person_attributes)
if invalid:
    print(f"Invalid attributes: {sorted(invalid)}")

result = generator.get_all_tokens(person_attributes)
for rule_id, token in result.tokens.items():
    print(f"{rule_id}: {token}")
Token Transformers
Transform token signatures into encrypted or hashed tokens.
HashTokenTransformer
One-way hashing without encryption.
hasher = HashTokenTransformer("YourHashingSecret")
signature = "DOE|J|MALE|1980-01-15"
hashed_token = hasher.transform(signature)
# Returns: Base64-encoded HMAC-SHA256 hash
EncryptTokenTransformer
Full encryption with AES-256-GCM.
encryptor = EncryptTokenTransformer(
    encryption_key="Secret-Encryption-Key-Goes-Here."  # Exactly 32 chars
)
signature = "DOE|J|MALE|1980-01-15"
encrypted_token = encryptor.transform(signature)
# Returns: Base64-encoded encrypted token
Complete Example
from opentoken.attributes.person.birth_date_attribute import BirthDateAttribute
from opentoken.attributes.person.first_name_attribute import FirstNameAttribute
from opentoken.attributes.person.last_name_attribute import LastNameAttribute
from opentoken.attributes.person.postal_code_attribute import PostalCodeAttribute
from opentoken.attributes.person.sex_attribute import SexAttribute
from opentoken.attributes.person.social_security_number_attribute import SocialSecurityNumberAttribute
from opentoken.tokens.token_definition import TokenDefinition
from opentoken.tokens.token_generator import TokenGenerator
from opentoken.tokens.tokenizer.sha256_tokenizer import SHA256Tokenizer
from opentoken.tokentransformer.encrypt_token_transformer import EncryptTokenTransformer
from opentoken.tokentransformer.hash_token_transformer import HashTokenTransformer
def generate_tokens():
    """Generate and print all OpenToken tokens for one sample person record.

    Validates the attributes first; on failure it prints the invalid
    attribute names and returns without generating anything. On success it
    prints one ``record_id,rule_id,token`` line per token rule.
    """
    record_id = "patient_001"
    person_attributes = {
        FirstNameAttribute: "John",
        LastNameAttribute: "Doe",
        BirthDateAttribute: "1980-01-15",
        SexAttribute: "Male",
        PostalCodeAttribute: "98004",
        SocialSecurityNumberAttribute: "123-45-6789",
    }

    # Transformers are applied in list order: hash first, then encrypt.
    tokenizer = SHA256Tokenizer([
        HashTokenTransformer("HashingSecret"),
        EncryptTokenTransformer("Secret-Encryption-Key-Goes-Here."),  # exactly 32 chars
    ])
    generator = TokenGenerator(TokenDefinition(), tokenizer)

    # Bail out early rather than emitting tokens derived from bad input.
    invalid = generator.get_invalid_person_attributes(person_attributes)
    if invalid:
        print(f"Invalid attributes: {sorted(invalid)}")
        return

    result = generator.get_all_tokens(person_attributes)
    for rule_id, token in result.tokens.items():
        print(f"{record_id},{rule_id},{token}")


if __name__ == "__main__":
    generate_tokens()
Batch Processing
For processing multiple records:
import csv
from opentoken.attributes.person.birth_date_attribute import BirthDateAttribute
from opentoken.attributes.person.first_name_attribute import FirstNameAttribute
from opentoken.attributes.person.last_name_attribute import LastNameAttribute
from opentoken.attributes.person.postal_code_attribute import PostalCodeAttribute
from opentoken.attributes.person.sex_attribute import SexAttribute
from opentoken.attributes.person.social_security_number_attribute import SocialSecurityNumberAttribute
from opentoken.tokens.token_definition import TokenDefinition
from opentoken.tokens.token_generator import TokenGenerator
from opentoken.tokens.tokenizer.sha256_tokenizer import SHA256Tokenizer
from opentoken.tokentransformer.encrypt_token_transformer import EncryptTokenTransformer
from opentoken.tokentransformer.hash_token_transformer import HashTokenTransformer
def process_csv(input_path, output_path, hashing_secret, encryption_key):
    """Tokenize every person record in a CSV file and write the tokens out.

    Args:
        input_path: Input CSV with columns RecordId, FirstName, LastName,
            BirthDate, Sex, PostalCode, SSN (missing columns become "").
        output_path: Destination CSV; one (RecordId, RuleId, Token) row is
            written per generated token.
        hashing_secret: Secret for the HMAC hashing transformer.
        encryption_key: 32-character key for the encryption transformer.

    Rows containing any invalid attribute are skipped silently.
    """
    # Build the tokenizer and generator once and reuse them for every row.
    tokenizer = SHA256Tokenizer([
        HashTokenTransformer(hashing_secret),
        EncryptTokenTransformer(encryption_key),
    ])
    generator = TokenGenerator(TokenDefinition(), tokenizer)

    with open(input_path, 'r') as infile, open(output_path, 'w', newline='') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.writer(outfile)
        writer.writerow(['RecordId', 'RuleId', 'Token'])
        for row in reader:
            record_id = row.get('RecordId', '')
            person_attributes = {
                FirstNameAttribute: row.get('FirstName', ''),
                LastNameAttribute: row.get('LastName', ''),
                BirthDateAttribute: row.get('BirthDate', ''),
                SexAttribute: row.get('Sex', ''),
                PostalCodeAttribute: row.get('PostalCode', ''),
                SocialSecurityNumberAttribute: row.get('SSN', ''),
            }
            # Skip rows that fail validation rather than emitting bad tokens.
            invalid = generator.get_invalid_person_attributes(person_attributes)
            if invalid:
                continue
            result = generator.get_all_tokens(person_attributes)
            for rule_id, token in result.tokens.items():
                writer.writerow([record_id, rule_id, token])
PySpark Integration
For distributed processing on Spark, use the opentoken_pyspark bridge:
from opentoken_pyspark import OpenTokenProcessor
processor = OpenTokenProcessor(
    hashing_secret="HashingSecret",
    encryption_key="EncryptionKey-32-Characters-Here",  # exactly 32 chars
)
# df must include the standard person columns (or aliases), e.g.:
# RecordId, FirstName, LastName, BirthDate, Sex, PostalCode, SSN
df_tokens = processor.process_dataframe(df)
df_tokens.show()
For overlap analysis between two tokenized datasets, use:
from opentoken_pyspark import OpenTokenOverlapAnalyzer
analyzer = OpenTokenOverlapAnalyzer("EncryptionKey-32-Characters-Here")
results = analyzer.analyze_overlap(tokens_df1, tokens_df2, ["T1", "T2"])
analyzer.print_summary(results)
See Spark or Databricks for end-to-end PySpark examples.
Cross-Language Parity
OpenToken guarantees identical output between Java and Python:
# This Python code produces the exact same tokens as equivalent Java code
person_attributes = {
    FirstNameAttribute: "John",
    LastNameAttribute: "Doe",
    BirthDateAttribute: "1980-01-15",
    SexAttribute: "Male",
    PostalCodeAttribute: "98004",
    SocialSecurityNumberAttribute: "123-45-6789",
}
Verify parity with:
cd tools/interoperability
python java_python_interoperability_test.py
Error Handling
try:
    tokenizer = SHA256Tokenizer([
        HashTokenTransformer("HashingSecret"),
        EncryptTokenTransformer("Secret-Encryption-Key-Goes-Here."),
    ])
    generator = TokenGenerator(TokenDefinition(), tokenizer)

    person_attributes = {
        FirstNameAttribute: "",  # Empty - will be invalid
        LastNameAttribute: "Doe",
        BirthDateAttribute: "invalid-date",  # Bad format
        SexAttribute: "Unknown",  # Not Male/Female
        PostalCodeAttribute: "98004",
        SocialSecurityNumberAttribute: "123-45-6789",
    }

    invalid = generator.get_invalid_person_attributes(person_attributes)
    if invalid:
        raise ValueError(f"Invalid attributes: {sorted(invalid)}")
except ValueError as e:
    print(f"Validation error: {e}")
Installation
# From repository
cd lib/python/opentoken
pip install -e .
# Development dependencies
pip install -r dev-requirements.txt
Next Steps
- Java API Reference - Cross-language reference
- Token Registration - Add custom tokens
- CLI Reference - Command-line usage
- Spark Integration - Distributed processing