#!/usr/bin/env python3
"""
Practical example: Generate synthetic data from a ShEx schema.

This example demonstrates how to generate synthetic RDF data from
the example ShEx schema in the repository.
"""
import pyrudof
import os
import sys
def generate_from_schema():
    """Generate synthetic RDF data from the ``simple.shex`` example schema.

    Builds a ``pyrudof.GeneratorConfig`` (20 entities, seed 42, Turtle
    output, statistics enabled, compression disabled), creates a
    ``pyrudof.DataGenerator`` from it and, when the schema file is present,
    runs the generator and prints the size and the first 10 lines of the
    generated file.

    When the schema file is missing, the configuration and the generator are
    still created so the API usage is demonstrated, but generation itself is
    skipped.

    Returns:
        int: ``0`` on success (including the skipped case), ``1`` if an
        exception was raised while generating or previewing the data.
    """
    # Relative path: it is resolved against the current working directory,
    # not against the location of this script.
    schema_path = "../../examples/simple.shex"

    schema_exists = os.path.exists(schema_path)
    if schema_exists:
        print(f"✓ Found schema: {schema_path}")
    else:
        print(f"Warning: Schema file not found at {schema_path}")
        print("This is a demonstration of the API, even without the actual file.")

    # Create configuration
    config = pyrudof.GeneratorConfig()

    # Configure generation
    config.set_entity_count(20)
    config.set_seed(42)  # For reproducible results

    # Configure output
    output_dir = "/tmp/pyrudof_generate"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "generated_simple.ttl")
    # NOTE(review): this statistics file name is assumed by the example; it is
    # not read back from pyrudof, and it is only reported if it exists.
    stats_file = os.path.join(output_dir, "generated_simple_stats.json")
    config.set_output_path(output_file)
    config.set_output_format(pyrudof.OutputFormat.Turtle)
    config.set_write_stats(True)
    config.set_compress(False)

    # Configure schema
    config.set_schema_format(pyrudof.SchemaFormat.ShEx)
    config.set_cardinality_strategy(pyrudof.CardinalityStrategy.Balanced)

    print("\nConfiguration:")
    print(f" Entities to generate: {config.get_entity_count()}")
    print(f" Random seed: {config.get_seed()}")
    print(f" Output file: {config.get_output_path()}")
    print(f" Statistics file: {stats_file}")

    # Create generator
    print("\n✓ Creating DataGenerator...")
    generator = pyrudof.DataGenerator(config)
    print("✓ DataGenerator created successfully")

    if schema_exists:
        # Top-level boundary of the example: any failure while generating or
        # previewing is reported with a traceback and turned into exit code 1.
        try:
            # Load schema and generate
            print(f"\nLoading schema from: {schema_path}")
            generator.run(schema_path)
            print("\n✓ Data generation completed!")
            print(f" Output written to: {output_file}")
            if os.path.exists(stats_file):
                print(f" Statistics written to: {stats_file}")

            if os.path.exists(output_file):
                # Show file size
                size = os.path.getsize(output_file)
                print(f" Generated file size: {size} bytes")

                # Show first few lines
                print("\nFirst 10 lines of generated data:")
                print("-" * 60)
                # Turtle documents are UTF-8; decode explicitly instead of
                # relying on the platform's default encoding.
                with open(output_file, "r", encoding="utf-8") as f:
                    for i, line in enumerate(f):
                        if i >= 10:
                            break
                        print(line.rstrip())
                print("-" * 60)
        except Exception as e:
            print(f"\n✗ Error during generation: {e}")
            import traceback
            traceback.print_exc()
            return 1
    else:
        print("\nSkipping actual generation (schema file not found)")
        print("To run with a real schema, provide a valid ShEx or SHACL file.")

    return 0
if __name__ == "__main__":
    # Script entry point: run the example between banner lines and use its
    # return value as the process exit status.
    banner = "=" * 60

    print(banner)
    print("Practical Example: Generate Synthetic RDF Data")
    print(banner)

    exit_code = generate_from_schema()

    print("\n" + banner)
    outcome = (
        "Example completed successfully!"
        if exit_code == 0
        else "Example completed with errors."
    )
    print(outcome)
    print(banner)

    sys.exit(exit_code)
|