Configuration Schemas

Dataset config schema

The dataset config is the base format for simulation configs (see below). A dataset config specifies generic details about the files in a dataset (whether or not it was produced by a simulation) and how to load them. The simulation config extends it with further details about the simulation that produced the dataset.
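
For orientation, here is a minimal sketch of a dataset config giving only the top-level required properties (data_root, dataset_name, and data_source); the values are illustrative, and the "dnadna" format may expect further data_source options such as the filename_format shown in the example below:

data_root: .
data_source:
  format: dnadna
dataset_name: my_dataset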

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/dataset.yml"
description: >-
    config file for a dataset--in this case a dataset is a collection of
    files containing SNP data organized in one of the dataset formats
    understood by DNADNA; this is used both for loading simulated data
    and for loading other datasets on which we want to perform prediction
type: "object"
properties:
    data_root:
        type: "string"
        format: "filename"
        description: >-
            root directory for all files related to the dataset, either as
            an absolute path, or as a path relative to the location of this
            config file

    dataset_name:
        type: "string"
        description: >-
            a name to give the dataset; used in generating filenames and
            logging output

    scenario_params_path:
        type: "string"
        format: "filename"
        description: >-
            path to the CSV file containing the per-scenario
            parameters used in this simulation, either as an
            absolute path, or as a path relative to this config file

    data_source:
        type: "object"
        description: >-
            options for describing the format in which the dataset is
            organized; currently only one format ("dnadna", the native
            format for DNADNA) is understood, but others may be added later
        properties:
            format:
                type: "string"
                description: >-
                    a unique label identifying the data format; the format
                    property determines what reader is used for simulation
                    data, and any further options in data_source may depend
                    on the format
                enum: ["dnadna"]
        required: ["format"]
        oneOf:
            - {"$ref": "dataset_formats/dnadna.yml"}

    position_format:
        type: "object"
        description: >-
            options related to the format of the positions array, e.g.
            whether it is given as absolute positions in a chromosome or as
            distances between positions, and whether it is normalized to
            [0.0, 1.0)
        properties:
            chromosome_size:
                type: "integer"
                description: >-
                    number of base pairs in the chromosome; required only for
                    converting from normalized to un-normalized positions
            initial_position:
                type: ["integer", "number"]
                description: >-
                    initial position to use for circular chromosomes; for
                    example, when converting from circular distances to
                    positions, an initial position is needed
            distance:
                type: "boolean"
                description: >-
                    if true, the positions array represents distances between
                    positions, instead of absolute positions
            normalized:
                type: "boolean"
                description: >-
                    if true, the positions (whether they are absolute or
                    relative) are normalized to the range [0.0, 1.0);
                    in this case it is also necessary to provide the
                    chromosome size if it is needed to convert to the
                    un-normalized values
            circular:
                type: "boolean"
                description: >-
                    whether or not the chromosome is circular
                default: false

    ignore_missing:
        description: >-
            ignore missing scenarios or replicates when loading
            data samples; in the case of missing samples the next
            one is tried until one is found
        type: "boolean"
        default: false

    cache_validation_set:
        description: >-
            used only during training, keeps the validation set cached
            in-memory, which can greatly speed up evaluation; however, if the
            validation set is too large to fit in available memory this can be
            disabled
        type: "boolean"
        default: false

    dnadna_version: {"$ref": "definitions.yml#/definitions/version"}

required:
    - data_root
    - dataset_name
    - data_source

Example

# config file for a dataset--in this case a dataset is a collection
# of files containing SNP data organized in one of the dataset
# formats understood by DNADNA; this is used both for loading
# simulated data and for loading other datasets on which we want to
# perform prediction

# used only during training, keeps the validation set cached in-
# memory, which can greatly speed up evaluation; however, if the
# validation set is too large to fit in available memory this can be
# disabled
cache_validation_set: false

# root directory for all files related to the dataset, either as an
# absolute path, or as a path relative to the location of this
# config file
data_root: /builds/mlgenetics/dnadna/dnadna/defaults

# options for describing the format in which the dataset is
# organized; currently only one format ("dnadna", the native format
# for DNADNA) is understood, but others may be added later
data_source:
  # string template for per-replicate simulation files in Python
  # string template format; the following template variables may be
  # used: 'dataset_name', the same as the dataset_name property used
  # in this config file; 'scenario', the scenario number; and
  # 'replicate', the replicate number of the scenario (if there are
  # multiple replicates); path separators may also be used in the
  # template to form a directory structure
  filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

  # a unique label identifying the data format; the format property
  # determines what reader is used for simulation data, and any
  # further options in data_source may depend on the format
  format: dnadna

  # keys in the NPZ file for the SNP matrix and position array
  # respectively; the "dnadna" format usually prescribes this to be
  # ["SNP", "POS"] but it can be overridden by this property
  keys:
  - SNP
  - POS

# a name to give the dataset; used in generating filenames and
# logging output
dataset_name: generic

# ignore missing scenarios or replicates when loading data samples;
# in the case of missing samples the next one is tried until one is
# found
ignore_missing: false

# path to the CSV file containing the per-scenario parameters used
# in this simulation, either as an absolute path, or as a path
# relative to this config file
scenario_params_path: /builds/mlgenetics/dnadna/dnadna/defaults/scenario_params.csv
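
The generated example above omits the optional position_format block; a sketch of what it might look like for a dataset whose positions are absolute and normalized (values illustrative):

# options related to the format of the positions array
position_format:
  chromosome_size: 2000000
  circular: false
  distance: false
  normalized: true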

Simulation config schema

The simulation config format is the same as the dataset config but adds a simulator_name property, as well as simulator-specific properties.

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/simulation.yml"
type: "object"
description: >-
    JSON Schema (YAML-formatted) for basic properties of a simulation on which
    a model will be trained.
allOf:
    - {"$ref": "dataset.yml"}
    -
        properties:
            plugins: {"$ref": "plugins.yml"}

            simulator_name:
                type: "string"
                description: >-
                    for simulations output by the `dnadna simulation` command, the
                    name of the simulation class used to initialize and run the
                    simulation

            n_scenarios:
                type: "integer"
                description: >-
                    number of different scenarios simulated; each scenario
                    is a parameterization of the simulation with a different
                    set of (possibly random) parameter values; each scenario
                    may have one or more "replicates"--simulations using the
                    same parameters, but with different randomized
                    outputs--the number of replicates of each scenario
                    should be listed in the scenario parameters table
                minimum: 1
                default: 1

            seed:
                type: ["integer", "null"]
                description: >-
                    fixed seed to use for seeding the random number
                    generator at the beginning of the simulation; if absent
                    then the PRNG's default seeding method is used
                default: null

            summary_statistics: {"$ref": "summary-statistics.yml#/definitions/summary_statistics"}

        required:
            - n_scenarios
            - scenario_params_path

    - {"$ref": "py-obj:dnadna.schemas.plugins.simulator"}

Example

# JSON Schema (YAML-formatted) for basic properties of a simulation
# on which a model will be trained.

# used only during training, keeps the validation set cached in-
# memory, which can greatly speed up evaluation; however, if the
# validation set is too large to fit in available memory this can be
# disabled
cache_validation_set: false

# root directory for all files related to the dataset, either as an
# absolute path, or as a path relative to the location of this
# config file
data_root: /builds/mlgenetics/dnadna/dnadna/defaults

# options for describing the format in which the dataset is
# organized; currently only one format ("dnadna", the native format
# for DNADNA) is understood, but others may be added later
data_source:
  # string template for per-replicate simulation files in Python
  # string template format; the following template variables may be
  # used: 'dataset_name', the same as the dataset_name property used
  # in this config file; 'scenario', the scenario number; and
  # 'replicate', the replicate number of the scenario (if there are
  # multiple replicates); path separators may also be used in the
  # template to form a directory structure
  filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

  # a unique label identifying the data format; the format property
  # determines what reader is used for simulation data, and any
  # further options in data_source may depend on the format
  format: dnadna

  # keys in the NPZ file for the SNP matrix and position array
  # respectively; the "dnadna" format usually prescribes this to be
  # ["SNP", "POS"] but it can be overridden by this property
  keys:
  - SNP
  - POS

# a name to give the dataset; used in generating filenames and
# logging output
dataset_name: one_event
generation_time: 25

# ignore missing scenarios or replicates when loading data samples;
# in the case of missing samples the next one is tried until one is
# found
ignore_missing: false
max: 4.698970004336019
mutation_rate: 1.0e-08
n_max: 4.698970004336019
n_min: 3.6989700043360187
n_replicates: 3
n_samples: 50

# number of different scenarios simulated; each scenario is a
# parameterization of the simulation with a different set of
# (possibly random) parameter values; each scenario may have one or
# more "replicates"--simulations using the same parameters, but with
# different randomized outputs--the number of replicates of each
# scenario should be listed in the scenario parameters table
n_scenarios: 100
recombination_rate: 1.0e-08

# path to the CSV file containing the per-scenario parameters used
# in this simulation, either as an absolute path, or as a path
# relative to this config file
scenario_params_path: /builds/mlgenetics/dnadna/docs/one_event_params.csv

# fixed seed to use for seeding the random number generator at the
# beginning of the simulation; if absent then the PRNG's default
# seeding method is used
seed: null
segment_length: 2000000.0

# for simulations output by the `dnadna simulation` command, the
# name of the simulation class used to initialize and run the
# simulation
simulator_name: one_event
tmax: 100000
tmin: 2000
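
A simulation config can also embed a summary statistics config under the summary_statistics property (see the summary statistics schema below); a sketch of what might be appended to the example above, with illustrative values:

# settings for calculating and outputting summary statistics on
# this simulation
summary_statistics:
  chromosome_size: 2.0e+6
  ld_options:
    circular: false
  sfs_options:
    folded: false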

Preprocessing config schema

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/preprocessing.yml"
type: "object"

description: >-
    required configuration for the `dnadna preprocess` command

properties:
    dataset:
        description: the dataset/simulation configuration
        "$ref": "dataset.yml"

    model_root:
        type: "string"
        format: "filename!"
        description: >-
            root directory for all training runs of this model / training
            configuration
        default: "."

    model_name:
        type: "string"
        description: >-
            unique name to give to models trained with this configuration;
            individual training runs will prepend this to the run_id
        minLength: 1

    learned_params:
        description: >-
            description of the parameters the network will be trained on
        "$ref": "param-set.yml"

    dataset_splits:
        description: >-
            how to split the dataset between training, validation, and test
            sets

            the numbers given for each subset are ratios which should sum to
            1; if they sum to less than 1 some portion of the dataset will be
            omitted, and if they sum to more than 1 an error is raised

            dataset splits are performed after unusable scenarios are omitted
            according to the pre-processing parameters (min_snp, etc.)
        type: "object"
        properties:
            training:
                description: portion of the dataset to use for training
                type: "number"
                exclusiveMinimum: 0
                exclusiveMaximum: 1
            validation:
                description: portion of the dataset to use for validation
                type: "number"
                exclusiveMinimum: 0
                exclusiveMaximum: 1
            test:
                description: >-
                    portion of the dataset to use for the test set (optional)
                type: "number"
                minimum: 0
                exclusiveMaximum: 1
            unused:
                description: >-
                    portion of the dataset which will not be used (optional,
                    reserved for custom purposes)
                type: "number"
                minimum: 0
                exclusiveMaximum: 1
        required: ["training", "validation"]
        additionalProperties: false

    preprocessing:
        description: >-
            these are parameters used for data pre-processing prior to
            training; they determine the subset of the dataset that will be
            used for a training run
        type: "object"
        properties:
            min_snp:
                description: "minimum number of SNPs each sample should have"
                type: ["integer", "null"]
                minimum: 1
                default: null
            min_indiv:
                description: "minimum number of individuals in each sample"
                type: ["integer", "null"]
                minimum: 1
                default: null
            seed:
                description: >-
                    random seed to initialize PRNG; in particular
                    randomization is used during pre-processing to separate
                    scenarios into the training and validation sets, and
                    specifying a seed ensures the split is consistent
                    between runs
                type: ["integer", "null"]
                default: null
            n_workers:
                description: >-
                    if greater than 0, the number of worker processes to
                    use for preprocessing; using multiple workers can in
                    some cases speed up preprocessing
                type: "integer"
                minimum: 0
                default: 0

    dnadna_version: {"$ref": "definitions.yml#/definitions/version"}

    plugins: {"$ref": "plugins.yml"}

required:
    - dataset
    - model_root
    - model_name
    - learned_params
    - dataset_splits
    - preprocessing

Example

# required configuration for the `dnadna preprocess` command

# the dataset/simulation configuration
dataset:
  # used only during training, keeps the validation set cached in-
  # memory, which can greatly speed up evaluation; however, if the
  # validation set is too large to fit in available memory this can
  # be disabled
  cache_validation_set: false

  # root directory for all files related to the dataset, either as
  # an absolute path, or as a path relative to the location of this
  # config file
  data_root: /builds/mlgenetics/dnadna/dnadna/defaults

  # options for describing the format in which the dataset is
  # organized; currently only one format ("dnadna", the native
  # format for DNADNA) is understood, but others may be added later
  data_source:
    # string template for per-replicate simulation files in Python
    # string template format; the following template variables may
    # be used: 'dataset_name', the same as the dataset_name property
    # used in this config file; 'scenario', the scenario number; and
    # 'replicate', the replicate number of the scenario (if there
    # are multiple replicates); path separators may also be used in
    # the template to form a directory structure
    filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

    # a unique label identifying the data format; the format
    # property determines what reader is used for simulation data,
    # and any further options in data_source may depend on the
    # format
    format: dnadna

    # keys in the NPZ file for the SNP matrix and position array
    # respectively; the "dnadna" format usually prescribes this to
    # be ["SNP", "POS"] but it can be overridden by this property
    keys:
    - SNP
    - POS

  # a name to give the dataset; used in generating filenames and
  # logging output
  dataset_name: generic

  # ignore missing scenarios or replicates when loading data
  # samples; in the case of missing samples the next one is tried
  # until one is found
  ignore_missing: false

  # path to the CSV file containing the per-scenario parameters used
  # in this simulation, either as an absolute path, or as a path
  # relative to this config file
  scenario_params_path: /builds/mlgenetics/dnadna/dnadna/defaults/scenario_params.csv

# how to split the dataset between training, validation, and test
# sets; the numbers given for each subset are ratios which should
# sum to 1: if less than 1 some portion of the dataset will be
# omitted, and if more than 1 an error is raised; dataset splits
# are performed after unusable scenarios are omitted according to
# the pre-processing parameters (min_snp, etc.)
dataset_splits:
  # portion of the dataset to use for training
  training: 0.7

  # portion of the dataset to use for validation
  validation: 0.3

# description of the parameters the network will be trained on
learned_params:
  param1:
    log_transform: false
    loss_func: MSE
    loss_weight: 1
    tied_to_position: false
    type: regression
  param2:
    classes: 2
    loss_func: Cross Entropy
    loss_weight: 1
    type: classification

# unique name to give to models trained with this configuration;
# individual training runs will prepend this to the run_id
model_name: default

# root directory for all training runs of this model / training
# configuration
model_root: /builds/mlgenetics/dnadna/dnadna/defaults

# these are parameters used for data pre-processing prior to
# training; they determine the subset of the dataset that will be
# used for a training run
preprocessing:
  # minimum number of individuals in each sample
  min_indiv: 1

  # minimum number of SNPs each sample should have
  min_snp: 1

  # if greater than 0, the number of worker processes to use for
  # preprocessing; using multiple workers can in some cases speed up
  # preprocessing
  n_workers: 2

  # random seed to initialize PRNG; in particular randomization is
  # used during pre-processing to separate scenarios into the
  # training and validation sets, and specifying a seed ensures the
  # split is consistent between runs
  seed: null
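
The example above uses only the required training and validation splits; the optional test portion can also be reserved, as long as the ratios still sum to 1. A sketch with illustrative ratios:

dataset_splits:
  test: 0.1
  training: 0.7
  validation: 0.2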

Learned params schema

# JSON Schema (YAML-formatted) for details about parameters to learn in a
# training run
$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/param-set.yml"
description: >-
    details of parameters to learn in training; it may be a mapping of param
    names to param configurations, or a list thereof (this is the case when
    using YAML ordered mappings, which are translated into lists of
    single-element mappings); in the latter case the specified order of the
    parameters is preserved when mapping parameters to optimization targets
oneOf:
    - {"$ref": "#/definitions/parameters"}
    - type: "array"
      items: {"$ref": "#/definitions/parameters"}
      minItems: 1
      errorMsg:
          minItems: at least one parameter must be declared in {property}

definitions:
    loss_func:
        description: >-
            name of the loss function to apply to this parameter; the name
            is the same as the class or function implementing the loss
            function (e.g. MSELoss) minus the "Loss" in the name and is
            case-insensitive (e.g. "mse" for MSELoss); spaces are also
            allowed in the function name for clarity (e.g. "cross entropy")
            and are simply ignored when looking up the associated
            class/function; the default value depends on the parameter
            type
        type: "string"
        minLength: 1

    loss_weight:
        description: >-
            additional weight by which to multiply the parameter's loss
            after applying the loss function, allowing some parameters to
            be weighted more heavily than others; by default all parameters
            are weighted equally
        type: "number"
        minimum: 0
        maximum: 1
        default: 1

    parameters:
        description: a mapping of parameter names to their details
        type: "object"
        minProperties: 1
        errorMsg:
            minProperties: at least one parameter must be declared in {property}
        # We use additionalProperties here because the property names are the
        # parameter names, which are arbitrary strings; hence every key/value
        # pair in this object is assumed to be a parameter definition
        additionalProperties: {"$ref": "#/definitions/parameter"}

    parameter:
        description: details about a single parameter
        type: "object"
        errorMsg:
            type: >-
                must be an object like:

                    param_name:
                        type: regression

                or:

                    param_name:
                        type: classification
                        classes:
                            - class1
                            - class2
                            - class3

                where classes can be a list of class names or just the number
                of classes
        required: ["type"]
        properties:
            type:
                description: >-
                    parameter type; either "regression" or "classification".
                    Classification parameters require the additional "classes"
                    property
                enum: ["regression", "classification"]

        # Select between either "regression" which has no other properties
        # and classification which requires the additional "classes"
        # properties.  This could possibly be expressed more succinctly with
        # JSONSchema Draft-07 conditionals, but this is roughly equivalent.
        #
        # TODO: This also implements different defaults for loss_func depending
        # on the parameter type; however I don't think it works yet to
        # automatically supply this default during validation; so that's a
        # special case that might have to be checked...
        oneOf:
            -
                properties:
                    type: {"const": "regression"}
                    loss_func:
                        "$ref": "#/definitions/loss_func"
                        default: "MSE"
                    loss_weight:
                        "$ref": "#/definitions/loss_weight"
                        default: 1
                    log_transform:
                        description: >-
                            whether or not a log transform should be applied to
                            this parameter's known values during
                            pre-processing; training is then performed with the
                            log values (regression parameters only)
                        type: "boolean"
                        default: false
                    tied_to_position:
                        description: >-
                            values of this parameter are SNP positions, so any
                            transformations or normalizations of the position
                            array must also be applied to this parameter during
                            training
                        type: "boolean"
                        default: false
                additionalProperties: false
            -
                properties:
                    type: {"const": "classification"}
                    loss_func:
                        "$ref": "#/definitions/loss_func"
                        default: "Cross Entropy"
                    loss_weight:
                        "$ref": "#/definitions/loss_weight"
                        default: 1
                    classes:
                        description: >-
                            a classification parameter's classes, either an
                            integer giving the number of classes in the
                            parameter, or an array to give explicit names to
                            the classes (one item for each class);  class names
                            can themselves be either strings, or integers
                            (which are converted automatically to strings, as
                            they are just labels for the classes)
                        type: ["integer", "array"]
                        items:
                            type: ["integer", "string"]
                        minimum: 1
                        minItems: 1
                    n_classes:
                        description: >-
                            after pre-processing, this property contains the number
                            of classes in a classification parameter; if the
                            "classes" property is an integer this is identical;
                            otherwise it is the length of the "classes" array;
                            normally this property should not be manually specified
                        type: "integer"
                        minimum: 1
                required: ["classes"]
                additionalProperties: false
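
To make the ordered-mapping form described above concrete, here is a sketch of learned_params given as a list of single-element mappings, which preserves the parameter order; the parameter and class names are illustrative:

learned_params:
- param1:
    type: regression
- param2:
    classes: [small, medium, large]
    type: classification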

Training config schema

# JSON Schema (YAML-formatted) for inference/training parameters file.
$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/training.yml"
type: "object"
description: >-
    the main training configuration, typically generated from an existing
    preprocessing config file
allOf:
    - properties:
          network:
              description:
                  "name and parameters of the neural net model to train"
              properties:
                  name:
                      description: "name of the network to train"
                  params:
                      description: >-
                          options specific to the neural net model being
                          trained; these are passed as keyword arguments to the
                          net's constructor (see dnadna.net module); the schema
                          for this property depends on which model is being
                          used--model-specific schemas are found in
                          dnadna/schemas/nets, though a model may also provide
                          its schema as a .schema attribute
              default:
                  name: "SPIDNA"
              "$ref": "py-obj:dnadna.schemas.plugins.network"

          optimizer:
              description: >-
                  name and parameters of the optimizer to use; all built-in
                  optimizers from the torch.optim package are available for use
                  here, and you can also provide a custom optimizer via a
                  plugin
              default:
                  name: "Adam"
                  params:
                      learning_rate: 0.001
                      weight_decay: 0
              "$ref": "py-obj:dnadna.schemas.plugins.optimizer"

          dataset_transforms:
              "$ref": "#/definitions/transforms"
              default: []

          n_epochs:
              description: >-
                  number of epochs over which to repeat the training process
              type: "integer"
              minimum: 1
              default: 1

          evaluation_interval:
              description: >-
                  interval (number of batches processed) between two validation
                  steps; for m evaluations per epoch, set to
                  n_training_samples // (batch_size * m) where the number of
                  training samples can be found in training logs
              type: "integer"
              minimum: 1
              default: 1

          batch_size:
              description: "sample batch size to train on"
              type: "integer"
              minimum: 1
              default: 1

          loader_num_workers:
              description: "number of subprocesses to use for data loading"
              type: "integer"
              minimum: 0
              default: 0

          use_cuda:
              description: "use CUDA-capable GPU where available"
              type: "boolean"
              default: true

          cuda_device:
              description: "specifies the CUDA device index to use"
              oneOf:
                  - type: "integer"
                    minimum: 0
                  - type: "null"
              default: null

          seed:
              description: >-
                  seed for initializing the PRNG prior to a training run for
                  reproducible results; if unspecified the PRNG chooses its
                  default seeding method
              type: ["integer", "null"]
              default: null

          model_filename_format:
              type: "string"
              description: >-
                  format string for the filename of the final output model; it
                  can use the template variables model_name, run_name, and/or
                  run_id, while the required variable "checkpoint" will be
                  replaced with names like "best", "last" and other
                  intermediate checkpoints
              minLength: 1
              default: "{model_name}_{run_name}_{checkpoint}_net.pth"

          run_name_format:
              description: >-
                  format string for the name given to this run for a sequence
                  of runs of the same model; the outputs of each run are placed
                  in subdirectories of <run_path>/<model_name> with the name of
                  this run; the format string can use the template variables
                  model_name and run_id
              type: "string"
              minLength: 4
              default: "run_{run_id}"

          train_mean:
              "$ref": "#/definitions/param_stats"
              description: >-
                  mean of each regression parameter over the training set

          train_std:
              "$ref": "#/definitions/param_stats"
              description: >-
                  standard deviation of each regression parameter over the
                  training set

    # Inherits the preprocessing config format
    - {"$ref": "preprocessing.yml"}

additionalProperties: true

definitions:
    transform_list:
        type: "array"
        items: {"$ref": "py-obj:dnadna.schemas.plugins.transform"}
        default: []
    transforms:
        description: >-
            list of transforms to apply to the dataset; all optional transforms
            are disabled by default unless specified here; transforms which
            don't take any parameters can be listed just by their name, whereas
            transforms which do take parameters are given as {'name': <name>,
            'param1': <value1>, 'param2': <value2>, ...}, where the params
            map param names (specific to the transform) to their values
        oneOf:
            - "$ref": "#/definitions/transform_list"
            - type: "object"
              properties:
                  training: {"$ref": "#/definitions/transform_list"}
                  validation: {"$ref": "#/definitions/transform_list"}
                  test: {"$ref": "#/definitions/transform_list"}
              patternProperties: {"^[a-zA-Z0-9_]+$": {"$ref": "#/definitions/transform_list"}}
              additionalProperties: false
    param_stats:
        type: "object"
        description: >-
            map of learned param names to some numerical statistic (e.g. mean,
            standard deviation, etc.) about the values of that parameter in the
            preprocessed scenario params table
        additionalProperties:
            type: "number"

Example

# the main training configuration, typically generated from an
# existing preprocessing config file

# sample batch size to train on
batch_size: 8

# specifies the CUDA device index to use
cuda_device: null

# the dataset/simulation configuration
dataset:
  # used only during training, keeps the validation set cached in-
  # memory, which can greatly speed up evaluation; however, if the
  # validation set is too large to fit in available memory this can
  # be disabled
  cache_validation_set: false

  # root directory for all files related to the dataset, either as
  # an absolute path, or as a path relative to the location of this
  # config file
  data_root: /builds/mlgenetics/dnadna/dnadna/defaults

  # options for describing the format in which the dataset is
  # organized; currently only one format ("dnadna", the native
  # format for DNADNA) is understood, but others may be added later
  data_source:
    # string template for per-replicate simulation files in Python
    # string template format; the following template variables may
    # be used: 'dataset_name', the same as the dataset_name property
    # used in this config file; 'scenario', the scenario number; and
    # 'replicate', the replicate number of the scenario (if there
    # are multiple replicates); path separators may also be used in
    # the template to form a directory structure
    filename_format: scenario_{scenario}/{dataset_name}_{scenario}_{replicate}.npz

    # a unique label identifying the data format; the format
    # property determines what reader is used for simulation data,
    # and any further options in data_source may depend on the
    # format
    format: dnadna

    # keys in the NPZ file for the SNP matrix and position array
    # respectively; the "dnadna" format usually prescribes this to
    # be ["SNP", "POS"] but it can be overridden by this property
    keys:
    - SNP
    - POS

  # a name to give the dataset; used in generating filenames and
  # logging output
  dataset_name: generic

  # ignore missing scenarios or replicates when loading data
  # samples; in the case of missing samples the next one is tried
  # until one is found
  ignore_missing: false

  # path to the CSV file containing the per-scenario parameters used
  # in this simulation, either as an absolute path, or as a path
  # relative to this config file
  scenario_params_path: /builds/mlgenetics/dnadna/dnadna/defaults/scenario_params.csv

# how to split the dataset between training, validation, and test
# sets; the numbers given for each subset are ratios which should
# sum to 1: if less than 1 some portion of the dataset will be
# omitted, and if more than 1 an error is raised; dataset splits
# are performed after unusable scenarios are omitted according to
# the pre-processing parameters (min_snp, etc.)
dataset_splits:
  # portion of the dataset to use for training
  training: 0.7

  # portion of the dataset to use for validation
  validation: 0.3

# list of transforms to apply to the dataset; all optional
# transforms are disabled by default unless specified here;
# transforms which don't take any parameters can be listed just by
# their name, whereas transforms which do take parameters are given
# as {'name': <name>, 'param1': <value1>, 'param2': <value2>, ...},
# where the params map param names (specific to the transform) to
# their values
dataset_transforms:
- crop:
    keep_polymorphic_only: true
    max_indiv: null
    max_snp: 400
- snp_format: concat
- validate_snp:
    uniform_shape: false

# interval (number of batches processed) between two validation
# steps; for m evaluations per epoch, set to n_training_samples //
# (batch_size * m) where the number of training samples can be found
# in training logs
evaluation_interval: 1

# description of the parameters the network will be trained on
learned_params:
  param1:
    log_transform: false
    loss_func: MSE
    loss_weight: 1
    tied_to_position: false
    type: regression
  param2:
    classes: 2
    loss_func: Cross Entropy
    loss_weight: 1
    type: classification

# number of subprocesses to use for data loading
loader_num_workers: 1

# format string for the filename of the final output model; it can
# use the template variables model_name, run_name, and/or run_id,
# while the required variable "checkpoint" will be replaced with
# names like "best", "last" and other intermediate checkpoints
model_filename_format: '{model_name}_{run_name}_{checkpoint}_net.pth'

# unique name to give to models trained with this configuration;
# individual training runs will prepend this to the run_id
model_name: default

# root directory for all training runs of this model / training
# configuration
model_root: /builds/mlgenetics/dnadna/dnadna/defaults

# number of epochs over which to repeat the training process
n_epochs: 1

# name and parameters of the neural net model to train
network:
  name: CustomCNN

  # net parameters for CNN
  params: {}

# name and parameters of the optimizer to use; all built-in
# optimizers from the torch.optim package are available for use
# here, and you can also provide a custom optimizer via a plugin
optimizer:
  name: Adam
  params:
    amsgrad: false
    betas:
    - 0.9
    - 0.999
    eps: 1.0e-08
    learning_rate: 0.001
    weight_decay: 0

# these are parameters used for data pre-processing prior to
# training; they determine the subset of the dataset that will be
# used for a training run
preprocessing:
  # minimum number of individuals in each sample
  min_indiv: 1

  # minimum number of SNPs each sample should have
  min_snp: 1

  # if greater than 0, the number of worker processes to use for
  # preprocessing; using multiple workers can in some cases speed up
  # preprocessing
  n_workers: 2

  # random seed to initialize PRNG; in particular randomization is
  # used during pre-processing to separate scenarios into the
  # training and validation sets, and specifying a seed ensures the
  # split is consistent between runs
  seed: null

# format string for the name given to this run for a sequence of
# runs of the same model; the outputs of each run are placed in
# subdirectories of <run_path>/<model_name> with the name of this
# run; the format string can use the template variables model_name
# and run_id
run_name_format: run_{run_id}

# seed for initializing the PRNG prior to a training run for
# reproducible results; if unspecified the PRNG chooses its default
# seeding method
seed: null

# use CUDA-capable GPU where available
use_cuda: true
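
The example above gives dataset_transforms as a single flat list applied to every subset; per the transforms definition in the schema, the transforms can instead be given separately per subset. A sketch with illustrative transforms:

dataset_transforms:
  training:
  - crop:
      max_snp: 400
  validation:
  - snp_format: concat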

Summary statistics config schema

$schema: "http://json-schema.org/draft-07/schema#"
$id: "py-pkgdata:dnadna.schemas/summary-statistics.yml"
type: "object"
description: >-
    summary statistics configuration: computing summary statistics requires
    a reference to a simulation config describing the simulation data to
    read, either embedded or inherited from an external file; alternatively,
    a simulation config file may itself contain an embedded summary
    statistics config in its `summary_statistics` property; for example,

    either a summary statistics config with an embedded/inherited simulation
    config::

        chromosome_size: 2e6
        # .. additional summary statistics properties ...
        simulation:
            # ... simulation config properties, or inherit: ...

    or you can use a simulation config with an embedded summary statistics
    config::

        data_root: "."
        name: "my_simulation"
        # ... additional simulation properties ...
        summary_statistics:
            # ... summary statistics config properties, or inherit: ...

definitions:
    summary_statistics:
        type: "object"
        description: >-
            settings for calculating and outputting summary statistics on this
            simulation
        properties:
            plugins: {"$ref": "plugins.yml"}

            filename_format:
                type: "string"
                description: >-
                    string template for per-scenario summary statistics files;
                    for each scenario three statistics tables are output: the
                    LD (Linkage Disequilibrium) scores, SFS (Site Frequency
                    Spectrum), and the "sel" file containing additional test
                    statistics such as Tajima's D, iHS, nSL, and possibly
                    others to be implemented; this template may contain up to
                    3 variables: 'dataset_name', the name of the dataset;
                    'scenario', the integer scenario index; and 'type' ('ld',
                    'sfs', or 'sel')
                default: "sumstats/scenario_{scenario}/{dataset_name}_{scenario}_{type}.csv"

            chromosome_size:
                type: "number"
                description: >-
                    number of base pairs in the chromosome
                # TODO: This seems arbitrary; why this number?  Should there
                # even be a default at all?
                default: 2.0e+6

            ld_options:
                type: "object"
                description: >-
                    options to pass to the LD computation
                default: {}
                properties:
                    circular:
                        type: "boolean"
                        description: >-
                            whether or not circular chromosomes are being
                            considered
                        default: false
                    distance_bins:
                        type: ["array", "integer"]
                        description: >-
                            distance bins into which to group SNPs; LD is then
                            averaged over those bins; either an array of
                            distance groups, or an integer giving the number of
                            bins to create over log space in max distance
                        default: 19  # TODO: Why 19?
                        minItems: 1
                        minimum: 1

            sfs_options:
                type: "object"
                description: >-
                    options to pass to the SFS computation
                default: {}
                properties:
                    folded:
                        type: "boolean"
                        description: >-
                            whether or not to compute the folded SFS
                        default: false

            sel_options:
                type: "object"
                description: >-
                    options to pass to the additional sel statistics
                default: {}
                properties:
                    window:
                        type: ["integer", "null"]
                        description: >-
                            number of bins into which to slice SNP positions;
                            the statistic is then computed over each window
                            instead of over all sites together; if the value is
                            0, the statistics are not binned
                        default: 100  # TODO: Why 100??
                        minimum: 0

oneOf:
    - allOf:
        - {"$ref": "#/definitions/summary_statistics"}
        -
            properties:
                simulation: {"$ref": "simulation.yml"}
            required: ["simulation"]
    - allOf:
        - {"$ref": "simulation.yml"}
        - {"required": ["summary_statistics"]}