File: etl-top10nl.cfg

package info (click to toggle)
python-stetl 2.0%2Bds-3
links: PTS, VCS
area: main
in suites: bullseye
size: 90,156 kB
sloc: python: 5,103; xml: 707; sql: 430; makefile: 154; sh: 65
file content (129 lines) | stat: -rw-r--r-- 5,017 bytes
# Example of process-chains for extracting Top10NL source data from GML to PostGIS.
# A Chain is a series of Components: one Input, zero or more Filters and one Output.
# The output of a Component is connected to the input of the next Component (except for
# the final Output Component, which writes to the final destination, e.g. Postgres).
#
# Currently 3 chains are executed in the following order:
# - SQL pre:  DB initialization, delete tables, create schema
# - Main ETL chain, consists of the following components
# 1. input_big_gml_files: read input file(s) and output feature elements
# 2. xml_assembler: assemble feature elements into smaller (etree) docs
# 3. xml_schema_validator: validation against top10nl XSD of (etree) doc [OPTIONAL]
# 4. transformer_xslt: transform each (etree) doc
# 5. output_ogr2ogr: output using ogr2ogr; input is a GML etree doc, output can be any OGR output
# - SQL post:  remove duplicates
#
# Any substitutable values are specified in curly brackets e.g. {password}.
# Actual values can be passed as args to Stetl main.py or as arguments from a wrapper program
# like top10extract.py to etl.py. Here are the 3 chains:

# Chains to execute, in the order listed. Chains are separated by commas
# (one chain per indented continuation line); within a chain the components
# are separated by "|". Each component name is the name of a section
# defined elsewhere in this file.
[etl]
chains = input_sql_pre|schema_name_filter|output_postgres,
         input_big_gml_files|xml_assembler|transformer_xslt|output_ogr2ogr,
         input_sql_post|schema_name_filter|output_postgres

# alternative chains for testing
#chains = input_big_gml_files|xml_assembler|transformer_xslt|output_ogr2ogr,
#         input_big_gml_files|xml_assembler|transformer_xslt|output_std,
#         input_big_gml_files|xml_assembler|transformer_xslt|output_multifile

# Pre SQL file inputs to be executed: the comma-separated SQL files below are
# the Input of the first chain (input_sql_pre|schema_name_filter|output_postgres).
[input_sql_pre]
class = stetl.inputs.fileinput.StringFileInput
file_path = sql/drop-tables.sql,sql/create-schema.sql

# Post SQL file input to be executed: the SQL file below is the Input of the
# last chain (input_sql_post|schema_name_filter|output_postgres).
[input_sql_post]
class = stetl.inputs.fileinput.StringFileInput
file_path = sql/delete-duplicates.sql

# Generic filter that substitutes Python-format placeholders such as {schema}
# in the strings passing through it. It sits between the SQL file inputs and
# output_postgres in the pre and post chains.
[schema_name_filter]
class = stetl.filters.stringfilter.StringSubstitutionFilter
# format args: the {schema} placeholder is replaced by the schema name
format_args = schema:{schema}

# Postgres output used as the final component of the SQL pre and post chains.
# All connection values are {placeholders} substituted from the arguments
# passed to Stetl (see the header comment of this file).
[output_postgres]
class = stetl.outputs.dboutput.PostgresDbOutput
database = {database}
host = {host}
port = {port}
user = {user}
password = {password}
schema = {schema}

# Reads the source input file(s) given by {gml_files} and produces
# gml:featureMember elements. First component of the main ETL chain.
[input_big_gml_files]
class = stetl.inputs.fileinput.XmlElementStreamerFileInput
file_path = {gml_files}
# Tag name of the elements this input produces
element_tags = featureMember

# Assembles gml:featureMember elements into etree docs, each with "max_elements"
# elements (value supplied through the {max_features} placeholder).
# "container_doc" is a multi-line value holding the (empty) document into which
# the elements are placed; its continuation lines must stay indented.
# "element_container_tag" names the container element within that document.
[xml_assembler]
class = stetl.filters.xmlassembler.XmlAssembler
max_elements = {max_features}
container_doc = <?xml version="1.0" encoding="UTF-8"?>
   <gml:FeatureCollection
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:top10nl="http://www.kadaster.nl/schemas/top10nl/v20120116"
    xmlns:gml="http://www.opengis.net/gml"
    xmlns:xlink="http://www.w3.org/1999/xlink"
    xmlns:smil20="http://www.w3.org/2001/SMIL20/"
    xmlns:smil20lang="http://www.w3.org/2001/SMIL20/Language"
    xsi:schemaLocation="http://www.kadaster.nl/schemas/top10nl/v20120116 http://www.kadaster.nl/schemas/top10nl/v20120116/TOP10NL_1_1_1.xsd">
    </gml:FeatureCollection >
element_container_tag = FeatureCollection

# Transforms each doc into simple feature data (single geometry, single attrs)
# by applying the XSLT script named below.
[transformer_xslt]
class = stetl.filters.xsltfilter.XsltFilter
script = top10-split_v1.1.1.xsl

# The ogr2ogr command-line; any output may be used here, as long as
# the input is a GML file. The "temp_file" is where etree-docs
# are saved. It has to be the same file as in the ogr2ogr command.
# TODO: find a way to use a GML-stream through stdin to ogr2ogr
[output_ogr2ogr]
class = stetl.outputs.ogroutput.Ogr2OgrOutput
temp_file = {temp_dir}/top10-tmp.gml
gfs_file = top10-v1.1.1.gfs
# lco will only be added to ogr2ogr on first run
lco = -lco LAUNDER=YES -lco PRECISION=NO
# spatial_extent translates to -spat xmin ymin xmax ymax
spatial_extent = {spatial_extent}
# Multi-line value: every continuation line must stay indented.
# The GML path below must match "temp_file" above.
ogr2ogr_cmd = ogr2ogr
    -append
    -f PostgreSQL
    "PG:dbname={database} host={host} port={port} user={user} password={password} active_schema={schema}"
    -gt 65536
    -a_srs epsg:28992
    -s_srs epsg:28992
    {temp_dir}/top10-tmp.gml
    {multi_opts}
    --config PG_USE_COPY YES


# Validator for XML: validates docs against the Top10NL XSD given below.
# NOTE(review): this component is optional and is not referenced by any chain
# in [etl]; per the header comment it belongs between xml_assembler and
# transformer_xslt when validation is wanted.
[xml_schema_validator]
class = stetl.filters.xmlvalidator.XmlSchemaValidator
xsd = http://www.kadaster.nl/schemas/top10nl/v20120116/TOP10NL_1_1_1.xsd
enabled = True

# Below: alternative outputs for testing

# Send to stdout. Testing alternative to output_ogr2ogr; used only by the
# commented-out alternative chains in [etl].
[output_std]
class = stetl.outputs.standardoutput.StandardXmlOutput

# Write to the single file given by file_path. Testing alternative; not
# referenced by any chain in this file.
[output_file]
class = stetl.outputs.fileoutput.FileOutput
file_path = output/top10nl-fc.gml

# Output multiple files, like the Top10 GML file chunks.
# Files are numbered according to the expression in file_path (%03d).
# NOTE(review): if this file is loaded with configparser's default
# BasicInterpolation, a literal % must be written %% - confirm which
# interpolation the loader uses before changing this value.
[output_multifile]
class = stetl.outputs.fileoutput.MultiFileOutput
file_path = output/top10nl-%03d.gml