subsetter/subsetter.example.yaml at main · msg555/subsetter · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
# Specify the source database. Most parameters can be provided by an
# environmenet variable if they are not provided here.
source:
  dialect: mysql # defaults to SUBSET_SOURCE_DIALECT (mysql or postgres)
  host: my-database # defaults to SUBSET_SOURCE_HOST
  port: 3306 # defaults to SUBSET_SOURCE_PORT
  username: my_user # deafults to SUBSET_SOURCE_USERNAME
  password: my_s3cret # deafults to SUBSET_SOURCE_PASSWORD
  session_sqls: # Set any additional session variables; e.g.
    - SET @@session.max_statement_time=0
    - SET @@session.net_read_timeout=3600
    - SET @@session.net_write_timeout=3600
    - SET @@session.wait_timeout=28800
    - SET @@session.innodb_lock_wait_timeout=3600

  # Set transaction isolation level. Defaults to 'READ COMMITTED' (SERIALIZABLE
  # for sqlite).
  isolation_level: "READ COMMITTED"

  # For engines where it applies specify database name to connect to as well.
  # database: my_dbname # deafults to SUBSET_SOURCE_DATABASE

  # For sqlite the the file named by 'database' will be mounted as the 'main'
  # schema. You can mount additional databases using the 'sqlite_databases'
  # mapping:
  #
  # sqlite_databases:
  #   foo: /path/to/foo.db
  #   bar: /path/to/bar.db

  # Specify additional SSL settings
  ssl:
    require: true # deafults to SUBSET_SOURCE_SSL_REQUIRE

    # Optionally specify client certificate and key if needed
    client:
      cert: /path/to/client.crt # deafults to SUBSET_SOURCE_SSL_CLIENT_CERT
      key: /path/to/client.key # deafults to SUBSET_SORUCE_SSL_CLIENT_KEY

    server:
      rootcert: /path/to/rootcert.crt # deafults to SUBSET_SOURCE_SSL_SERVER_ROOTCERT
      verify: identity # can be 'none', 'cert', or 'identity'
                       # deafults to SUBSET_SOURCE_SSL_SERVER_VERIFY


# All plan-phase options. These options control what and how data will be
# sampled in the next phase.
planner:
  # Targets define where to start your sampling. Any additional rows brought in
  # for any tables will be done just by following foreign keys (except for
  # passthrough tables, see below)
  targets:
    # Sample 100 users randomly from db1.users
    db1.users:
      amount: 100
      # Additional possible filters shown below. Multiple filters can be provided
      # and the results will be intersected together (except all which overrides
      # everything). Additional rows will only be sampled if a row from another
      # targetted table has a dependence on them.

      # all: true
      # percent: 5.0
      # sql: "EXISTS (SELECT * FROM db1.carts WHERE db1.carts.user_id = db1.users.user_id)"
      # in:
      #   user_id: [123, 456, 789]
      # like:
      #   email: ['%@somewhere.com']

    # Sample 10% of rows in db2.gadgets
    db2.gadgets:
      percent: 10.0

  # Select tells the planner what tables you want to pull in rows for. Note that
  # the planner will pull in additional tables if there are foreign keys pointing
  # to them within the already selected tables.
  select:
  - db1.*
  - db2.gadgets
  - db2.gizmos-*

  # Add additional constraints for some tables. Constraints can only be applied
  # for tables where filtering rows would not cause foreign key constraints to be
  # violated.
  table_constraints:
    db1.user_data:
    - column: action_date
      operator: '>'
      value: '2023-07-01'

  # Passthrough tables will be copied in full. Passthrough tables must not have
  # foreign keys pointing to non-passthrough tables. The planner will make no
  # attempt to follow foreign keys involving passthrough tables. Generally
  # relatively constant, lookup data should be marked as passthrough. Tables
  # listed here do not need to be listed in the "select" list.
  passthrough:
  - db1.ui_features

  # The planner does its best to generate a plan that will not break foreign key
  # relationships. This isn't always possible without some help; here you can tell
  # the planner it shouldn't worry about certain relationships so it can generate
  # a plan. Foreign keys that exist primarily for de-normalization are especially
  # good candidates to be ignored.
  ignore_fks:
  - src_table: db2.gizmos-foo
    dst_table: db2.gizmos-bar

  # Tell the planner about any foreign key relationships that logically exist but
  # are not enforced in your database.
  extra_fks:
  - src_table: db2.gadgets
    src_columns: [user_id]
    dst_table: db1.users
    dst_columns: [id]

  # If set to true the subsetter will automatically attempt to infer foreign
  # keys. It does this by inferring that a column name that matches the name
  # of a primary key column should function as a foreign key to that table.
  # If set to 'schema' will limit matches to tables within the same schema.
  infer_foreign_keys: none # can be 'none', 'schema', or 'all'

  # By default the subsetter will automatically pull in tables referenced by
  # tables already being selected to ensure their dependent rows can be pulled
  # in with sampling. If this behavior is not desired you can set this value to
  # false.
  include_dependencies: true

# Optional sampler config. Will write sample output to a directory named
# 'output/' by default.
sampler:
  # Specify the output target. Generally this will be a database as shown below
  # although # writing output to disk is also supported. Similar to specifying
  # the source database, output destination parameters can largely be controlled
  # with environmental variables. See the documentation of the source config for
  # full option details.
  output:
    mode: database # Write output directly into database
    dialect: mysql
    host: my-dest-database # deafults to SUBSET_DESTINATION_HOST
    port: 3306 # deafults to SUBSET_DESTINATION_PORT
    username: my_dest_user # deafults to SUBSET_DESTINATION_USERNAME
    password: my_dest_s3cret  # deafults to SUBSET_DESTINATION_PASSWORD
    # database: my_dbname # deafults to SUBSET_DESTINATION_DATABASE (if needed)

    # Apply regex search and replace rules to each source "schema.table" name.
    # Will attempt to apply each replacement in the order they are listed. Example
    # below can be used to add the suffix "_out" to the schema.
    remap:
      - search: "^(\\w+)\\."
        replace: "\\1_out."

    # If a primary key uniqueness constraint is violated this parameter controls
    # what should be done with the inserted row. This should only happen if
    # sampling data into a non-empty table. Options include:
    #   error (the default): produce an error and fail sampling
    #   replace: replace the row with the new sampled row
    #   skip: do not write the sampeld row
    conflict_strategy: error

    # If merge is enabled subsetter will attempt to merge sampled data into
    # existing tables. Passthrough tables will be inserted as normal except
    # it will use the 'skip' conflict strategy. All other tables must have a
    # single-column, non-negative, integral primary key.
    #
    # Non-passthrough tables will have their primary keys remapped using the
    # below equation:
    #
    #   new_id = source_id + max(0, existing_ids...) + 1
    merge: false

  # Alternative configuration to output JSON files within a directory
  # output:
  #   mode: directory
  #   directory: some/output/dir/

  # Filters allow you to anonymize and shape your output dataset.
  filters:

    # A list of filters can be given for each table. Filters are applied in order
    # with the results of the last filter being part of the output.
    db1.users:

    # The 'null' secret will replace the sampled value with null for the given
    # columns. The 'zero' filter works the same way but instead replaces the value
    # with a 'zero' value of the same type (e.g. empty string or 0)
    - op: "null"
      columns: [big_secret]

    # The 'omit' filter will remove the given column from the output set. This can
    # be useful if that column doesn't exist in your output or it cannot be set
    # directly.
    - op: omit
      columns: [extra_column]

    # The 'constant' filter can be used to replace a column's value with a
    # constant value instead.
    - op: constant
      columns: [is_active, role]
      values: [1, admin]

    # The 'random_int' and 'random_float' filters can replace a value with a
    # random int or float in the desired range (inclusive).
    - op: random_int
      columns: [test_score]
      low: 0
      high: 100

    # The 'random_string' filter can replace column values with random textual
    # data in a variety of forms.
    - op: random_string

      # Here we generate a hex string of length 16. Other 'alphabets' include
      # "alnum", "hex_lower", "hex_upper", "digit", "alpha", "alpha_lower",
      # "alpha_upper".
      alphabet: hex
      length: 16

    # The 'random_uuid' replaces the given column with a random uuid.
    - op: random_uuid
      columns: [gizmo_uuid]

    # The remaining filters are wrappers around the corresponding Faker generator
    # (see https://faker.readthedocs.io/en/master/). The full list of supported
    # filters is:
    # - fake_email
    # - fake_first_name
    # - fake_last_name
    # - fake_name
    # - fake_phone_number
    # - fake_license_plate
    # - fake_vin
    # - fake_address
    # - fake_building_number
    # - fake_city
    # - fake_state
    # - fake_state_abbr
    # - fake_country
    # - fake_country_code
    # - fake_postal_code
    # - fake_street_address
    # - fake_street_name
    # - fake_latitude
    # - fake_longitude
    #
    # If the optional property 'unique' is set Faker will use the 'unique'
    # interface. Note that this can lead to failures if generating a lot of data
    # or setting on filters with limited output space (e.g. "fake_state")
    - op: fake_email
      columns: [email_address]
      unique: true # optional, if set uses the 'unique' faker interface

    # Arbitrary filters can be implemented dynamically. To do so create a Python
    # module that can be imported by the subsetter. The module should define a
    # factory method that will be passed a list of the requested columns and
    # should return a list of the values after filtering them. For example you
    # could write:
    #
    # class MyFilter:
    #     def __init__(self, mysuffix: str):
    #         self.suffix = mysuffix
    #
    #         def __call__(self, values):
    #             return [f"{value}-{self.suffix}" for value in values]
    - op: plugin
      columns: [col1, col2]
      module: fully.qualified.module
      class: MyFilter
      kwargs:
        mysuffix: hello!

  # The sampler supports multiplying rows for generating additional data. By
  # default this operates by mapping all primary keys and foreign keys into a
  # new key-space.
  #
  # Note: This functionality is currently only limited to integral primary
  # keys/foreign keys. Other types of keys, e.g. uuids, could be supported in
  # the future. Non-opaque keys, like email addresses, will not be supported.
  # Integer keys will be mapped into a new key-space using the equation:
  #
  #   new_id = source_id * multiplier + instance_id
  multiplicity:
    # Multiplication factor. If set to 1 the rest of these fields are ignored.
    multiplier: 10

    # Tables that should not be multiplied. Note passthrough tables are never
    # multiplied and do not need to be listed here.
    ignore_tables:
      - db1.ui_features

    # Extra columns that should have their ID-space mapped that aren't part of a
    # primary key or foreign key.
    extra_columns:
      db2.gadgets: [user_id]

    # Some tables may have multiple columns that make up the primary key. By
    # default every column will be mapped; you can set this field to disable
    # mapping of some of them.
    ignore_primary_key_columns:
      db1.gizmo: [setting_name]

  # Multiplicity and merge logic require analysis of foreign key constraints in
  # the source database. If using either of these options this `infer_foreign_keys`
  # setting will use the same logic as in planner.infer_foreign_keys to infer
  # foreign key relationships when a column name uniquely matches a
  # single-column primary key in another table.
  infer_foreign_keys: none # can be 'none', 'schema', or 'all'

  # Compaction refers to removing gaps in the sampled ID space of a specific
  # table in a column. In most cases this is unnecessary but sometimes it can
  # be helpful to keep the IDs in the sampled dataset small. Enabling compaction
  # can require more tables to be materialized on the source database and can
  # have some mild performance impacts on sampling.
  compact:
    # If set to true any tables that have single-column, integral primary key
    # will have their primary key marked for compaction.
    primary_keys: false

    # If set to true any tables that have single-column, integral,
    # auto-increment primary key will have their primary key marked for
    # compaction.
    auto_increment_keys: false

    # Mapping of additional columns that should be compacted if needed. Note if
    # multiple columns in the same table are compacted they will end up having
    # the same value.
    columns:
      db1.gizmo: [extra_id]

    # Minimum ID to set of the first sampled row for a table.
    start_key: 1