From ac08a562d00e5faf9465fe7417b233b55182daa1 Mon Sep 17 00:00:00 2001
From: chaopower
Date: Tue, 26 Dec 2023 10:18:15 +0800
Subject: [PATCH] Adjust
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 codes/cromwell.examples.conf | 771 +++++++++++++++++++++++++++++------
 codes/run_wdl.py             |   4 +-
 pipeline.wdl                 |  27 +-
 wdl/chemo.wdl                |   1 +
 wdl/hereditary.wdl           |   1 +
 wdl/msi.wdl                  |   1 +
 wdl/neoantigen.wdl           |   1 +
 wdl/pollution.wdl            |   5 +-
 wdl/postprocess.wdl          |   2 +
 9 files changed, 683 insertions(+), 130 deletions(-)

diff --git a/codes/cromwell.examples.conf b/codes/cromwell.examples.conf
index f40a815..8dd5d60 100755
--- a/codes/cromwell.examples.conf
+++ b/codes/cromwell.examples.conf
@@ -1,122 +1,120 @@
-# This is an example of how you can use the LocalExample backend to define
-# a new backend provider. *This is not a complete configuration file!* The
-# content here should be copy pasted into the backend -> providers section
-# of cromwell.example.backends/cromwell.examples.conf in the root of the repository.
-# You should uncomment lines that you want to define, and read carefully to customize
-# the file. If you have any questions, please open an issue at
-# https://www.github.com/broadinstitute/cromwell/issues
+# This is a "default" Cromwell example that is intended for you to start with
+# and edit for your needs. Specifically, you will be interested to customize
+# the configuration based on your preferred backend (see the backends section
+# below in the file). For backend-specific examples for you to copy-paste here,
+# please see the cromwell.backend.examples folder in the repository. The files
+# there also include links to online documentation (if it exists)
-# Documentation
-# https://cromwell.readthedocs.io/en/stable/backends/Local/
+# This line is required. It pulls in default overrides from the embedded cromwell
+# `reference.conf` (in core/src/main/resources) needed for proper performance of cromwell.
+include required(classpath("application"))
-# Define a new backend provider.
+# Cromwell HTTP server settings
+webservice {
+  #port = 8000
+  #interface = 0.0.0.0
+  #binding-timeout = 5s
+  #instance.name = "reference"
+}
-LocalExample {
-
-  # The actor that runs the backend. In this case, it's the Shared File System (SFS) ConfigBackend.
-  actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
-
-  # The backend custom configuration.
-  config {
-
-    # Optional limits on the number of concurrent jobs
-    #concurrent-job-limit = 5
-
-    # If true submits scripts to the bash background using "&". Only usefull for dispatchers that do NOT submit
-    # the job and then immediately return a scheduled job id.
-    run-in-background = true
-
-    # `temporary-directory` creates the temporary directory for commands.
-    #
-    # If this value is not set explicitly, the default value creates a unique temporary directory, equivalent to:
-    # temporary-directory = "$(mktemp -d \"$PWD\"/tmp.XXXXXX)"
-    #
-    # The expression is run from the execution directory for the script. The expression must create the directory
-    # if it does not exist, and then return the full path to the directory.
-    #
-    # To create and return a non-random temporary directory, use something like:
-    # temporary-directory = "$(mkdir -p /tmp/mydir && echo /tmp/mydir)"
-    # `script-epilogue` configures a shell command to run after the execution of every command block.
-    #
-    # If this value is not set explicitly, the default value is `sync`, equivalent to:
-    # script-epilogue = "sync"
-    #
-    # To turn off the default `sync` behavior set this value to an empty string:
-    # script-epilogue = ""
-
-    # `glob-link-command` specifies command used to link glob outputs, by default using hard-links.
-    # If filesystem doesn't allow hard-links (e.g., beeGFS), change to soft-links as follows:
-    # glob-link-command = "ln -sL GLOB_PATTERN GLOB_DIRECTORY"
-
-    # The list of possible runtime custom attributes.
-    runtime-attributes = """
-    String? docker
-    String? docker_user
-    """
-
-    # Submit string when there is no "docker" runtime attribute.
-    submit = "/usr/bin/env bash ${script}"
-
-    # Submit string when there is a "docker" runtime attribute.
-    submit-docker = """
-    docker run \
-      --rm -i \
-      ${"--user " + docker_user} \
-      --entrypoint ${job_shell} \
-      -v ${cwd}:${docker_cwd} \
-      ${docker} ${script}
-    """
-
-    # Root directory where Cromwell writes job results. This directory must be
-    # visible and writeable by the Cromwell process as well as the jobs that Cromwell
-    # launches.
-    root = "cromwell-executions"
-
-    # Root directory where Cromwell writes job results in the container. This value
-    # can be used to specify where the execution folder is mounted in the container.
-    # it is used for the construction of the docker_cwd string in the submit-docker
-    # value above.
-    dockerRoot = "/cromwell-executions"
-
-    # File system configuration.
-    filesystems {
-
-      # For SFS backends, the "local" configuration specifies how files are handled.
-      local {
-
-        # Try to hard link (ln), then soft-link (ln -s), and if both fail, then copy the files.
-        localization: [
-          "hard-link", "soft-link", "copy"
-        ]
-
-        # Call caching strategies
-        caching {
-          # When copying a cached result, what type of file duplication should occur.
-          # For more information check: https://cromwell.readthedocs.io/en/stable/backends/HPC/#shared-filesystem
-          duplication-strategy: [
-            "hard-link", "soft-link", "copy"
-          ]
-
-          # Strategy to determine if a file has been used before.
-          # For extended explanation and alternative strategies check: https://cromwell.readthedocs.io/en/stable/Configuring/#call-caching
-          hashing-strategy: "md5"
-
-          # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash.
-          # If false or the md5 does not exist, will proceed with the above-defined hashing strategy.
-          check-sibling-md5: false
-        }
-      }
-    }
-
-    # The defaults for runtime attributes if not provided.
-    default-runtime-attributes {
-      failOnStderr: false
-      continueOnReturnCode: 0
-    }
-  }
-}
+akka {
+  # Optionally set / override any akka settings
+  http {
+    server {
+      # Increasing these timeouts allows REST API responses for very large jobs
+      # to be returned to the user. When the timeout is reached the server would respond
+      # `The server was not able to produce a timely response to your request.`
+      # https://gatkforums.broadinstitute.org/wdl/discussion/10209/retrieving-metadata-for-large-workflows
+      # request-timeout = 20s
+      # idle-timeout = 20s
+    }
+  }
+}
+
+# Cromwell "system" settings
+system {
+  # If 'true', a SIGINT will trigger Cromwell to attempt to abort all currently running jobs before exiting
+  abort-jobs-on-terminate = true
+
+  # If 'true', a SIGTERM or SIGINT will trigger Cromwell to attempt to gracefully shutdown in server mode,
+  # in particular clearing up all queued database writes before letting the JVM shut down.
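+  # For example, `kill -TERM <cromwell-pid>` (pid being a placeholder) takes this graceful
+  # path, while `kill -9` cannot be caught and skips the cleanup entirely.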
+  # The shutdown is a multi-phase process, each phase having its own configurable timeout. See the Dev Wiki for more details.
+  graceful-server-shutdown = true
+
+  # Cromwell will cap the number of running workflows at N
+  max-concurrent-workflows = 10000
+
+  # Cromwell will launch up to N submitted workflows at a time, regardless of how many open workflow slots exist
+  max-workflow-launch-count = 100
+
+  # Number of seconds between workflow launches
+  #new-workflow-poll-rate = 20
+
+  # Since the WorkflowLogCopyRouter is initialized in code, this is the number of workers
+  #number-of-workflow-log-copy-workers = 10
+
+  # Default number of cache read workers
+  #number-of-cache-read-workers = 25
+
+  io {
+    # throttle {
+    #   # Global Throttling - This is mostly useful for GCS and can be adjusted to match
+    #   # the quota available on the GCS API
+    #   #number-of-requests = 100000
+    #   #per = 100 seconds
+    # }
+
+    # Number of times an I/O operation should be attempted before giving up and failing it.
+    number-of-attempts = 5
+  }
+
+  # Maximum number of input file bytes allowed in order to read each type.
+  # If exceeded a FileSizeTooBig exception will be thrown.
+  input-read-limits {
+
+    #lines = 128000
+
+    #bool = 7
+
+    #int = 19
+
+    #float = 50
+
+    #string = 128000
+
+    #json = 128000
+
+    #tsv = 128000
+
+    #map = 128000
+
+    #object = 128000
+  }
+
+  abort {
+    # These are the default values in Cromwell; in most circumstances there should not be a need to change them.
+
+    # How frequently Cromwell should scan for aborts.
+    scan-frequency: 30 seconds
+
+    # The cache of in-progress aborts. Cromwell will add entries to this cache once a WorkflowActor has been messaged to abort.
+    # If on the next scan an 'Aborting' status is found for a workflow that has an entry in this cache, Cromwell will not ask
+    # the associated WorkflowActor to abort again.
+    cache {
+      enabled: true
+      # Guava cache concurrency.
+      concurrency: 1
+      # How long entries in the cache should live from the time they are added to the cache.
+      ttl: 10 minutes
+      # Maximum number of entries in the cache.
+      size: 200000
+    }
+  }
+
+  # Cromwell reads this value into the JVM's `networkaddress.cache.ttl` setting to control DNS cache expiration
+  dns-cache-ttl: 3 minutes
+}
+
 workflow-options {
     # These workflow options will be encrypted when stored in the database
     #encrypted-fields: []
     #encryption-key: "workflow-options-encryption-key"
     #unencrypted-fields: []
     #aes-256-key: "key here"
 
     # Directory where to write per workflow logs
     #workflow-log-dir: "cromwell-workflow-logs"
 
     # When true, per workflow logs will be deleted after copying
     #workflow-log-temporary: true
 
     # Workflow type defaults to WDL
     #workflow-type: WDL
@@ -139,9 +137,554 @@ workflow-options {
 
     # When a workflow type version is not provided on workflow submission, this specifies the default type version.
-    #workflow-type-version: "draft-2"
+    workflow-type-version: "draft-2"
 
     # To set a default hog group rather than defaulting to workflow ID:
-    # hogGroup: "static"
+    hogGroup: "static"
 }
+}
+
+# Optional call-caching configuration.
+call-caching {
+  # Allows re-use of existing results for jobs you've already run
+  # (default: false)
+  enabled = false
+
+  # Whether to invalidate a cache result forever if it cannot be reused. Disable this if you expect some cache copies
+  # to fail for external reasons which should not invalidate the cache (e.g. auth differences between users):
+  # (default: true)
+  #invalidate-bad-cache-results = true
+
+  # The maximum number of times Cromwell will attempt to copy cache hits before giving up and running the job.
+  #max-failed-copy-attempts = 1000000
+
+  # blacklist-cache {
+  #   # The call caching blacklist cache is off by default. This cache is used to blacklist cache hits based on cache
+  #   # hit ids or buckets of cache hit paths that Cromwell has previously failed to copy for permissions reasons.
+  #   enabled: true
+  #
+  #   # A blacklist grouping can be specified in workflow options which will inform the blacklister which workflows
+  #   # should share a blacklist cache.
+  #   groupings {
+  #     workflow-option: call-cache-blacklist-group
+  #     concurrency: 10000
+  #     ttl: 2 hours
+  #     size: 1000
+  #   }
+  #
+  #   buckets {
+  #     # Guava cache concurrency.
+  #     concurrency: 10000
+  #     # How long entries in the cache should live from the time of their last access.
+  #     ttl: 20 minutes
+  #     # Maximum number of entries in the cache.
+  #     size: 1000
+  #   }
+  #
+  #   hits {
+  #     # Guava cache concurrency.
+  #     concurrency: 10000
+  #     # How long entries in the cache should live from the time of their last access.
+  #     ttl: 20 minutes
+  #     # Maximum number of entries in the cache.
+  #     size: 100000
+  #   }
+  #
+  # }
+}
+
+# Google configuration
+google {
+
+  #application-name = "cromwell"
+
+  # Default: just application default
+  #auths = [
+
+    # Application default
+    #{
+    #  name = "application-default"
+    #  scheme = "application_default"
+    #},
+
+    # Use a static service account
+    #{
+    #  name = "service-account"
+    #  scheme = "service_account"
+    #  Choose between PEM file and JSON file as a credential format. They're mutually exclusive.
+    #  PEM format:
+    #  service-account-id = "my-service-account"
+    #  pem-file = "/path/to/file.pem"
+    #  JSON format:
+    #  json-file = "/path/to/file.json"
+    #}
+
+    # Use service accounts provided through workflow options
+    #{
+    #  name = "user-service-account"
+    #  scheme = "user_service_account"
+    #}
+  #]
+}
+
+docker {
+  hash-lookup {
+    # Set this to match your available quota against the Google Container Engine API
+    #gcr-api-queries-per-100-seconds = 1000
+
+    # Time in minutes before an entry expires from the docker hashes cache and needs to be fetched again
+    #cache-entry-ttl = "20 minutes"
+
+    # Maximum number of elements to be kept in the cache. If the limit is reached, old elements will be removed from the cache
+    #cache-size = 200
+
+    # How should docker hashes be looked up. Possible values are "local" and "remote"
+    # "local": Lookup hashes on the local docker daemon using the cli
+    # "remote": Lookup hashes on docker hub, gcr, gar, quay
+    #method = "remote"
+  }
+}
+
+engine {
+  # This instructs the engine which filesystems are at its disposal to perform any IO operation that it might need.
+  # For instance, WDL variables declared at the Workflow level will be evaluated using the filesystems declared here.
+  # If you intend to be able to run workflows with this kind of declaration:
+  # workflow {
+  #   String str = read_string("gs://bucket/my-file.txt")
+  # }
+  # You will need to provide the engine with a gcs filesystem
+  # Note that the default filesystem (local) is always available.
+  filesystems {
+    # gcs {
+    #   auth = "application-default"
+    #   # Google project which will be billed for the requests
+    #   project = "google-billing-project"
+    # }
+    local {
+      #enabled: true
+    }
+  }
+}
+
+# You probably don't want to override the language factories here, but the strict-validation and enabled fields might be of interest:
+#
+# `enabled`: Defaults to `true`. Set to `false` to disallow workflows of this language/version from being run.
+# `strict-validation`: Specifies whether workflows fail if the inputs JSON (or YAML) file contains values which the workflow did not ask for (and will therefore have no effect).
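+#
+# As an illustrative sketch (workflow and key names hypothetical): with strict validation on,
+# an inputs file such as { "pipeline.sample": "s1", "pipeline.unused_key": 1 } is rejected when
+# the workflow never declares `unused_key`; with it off, the extra key is silently ignored.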
+languages {
+  WDL {
+    versions {
+      "draft-2" {
+        language-factory = "languages.wdl.draft2.WdlDraft2LanguageFactory"
+        config {
+          # strict-validation: true
+          enabled: true
+          caching {
+            # WDL Draft 2 namespace caching is off by default; this value must be set to true to enable it.
+            enabled: true
+            # Guava cache concurrency
+            concurrency: 4
+            # How long entries in the cache should live from the time of their last access.
+            ttl: 20 minutes
+            # Maximum number of entries in the cache (i.e. the number of workflow source + imports => namespace entries).
+            size: 10000
+          }
+        }
+      }
+      # draft-3 is the same as 1.0 so files should be able to be submitted to Cromwell as 1.0
+      # "draft-3" {
+      #   language-factory = "languages.wdl.draft3.WdlDraft3LanguageFactory"
+      #   config {
+      #     strict-validation: true
+      #     enabled: true
+      #   }
+      # }
+      "1.0" {
+        # 1.0 is just a rename of draft-3, so yes, they really do use the same factory:
+        # language-factory = "languages.wdl.draft3.WdlDraft3LanguageFactory"
+        # config {
+        #   strict-validation: true
+        #   enabled: true
+        # }
+      }
+    }
+  }
+}
+
+# Here is where you can define the backend providers that Cromwell understands.
+# The default is a local provider.
+# To add additional backend providers, you should copy-paste additional backends
+# of interest that you can find in the cromwell.example.backends folder
+# at https://www.github.com/broadinstitute/cromwell
+# Other backend providers include SGE, SLURM, Docker, udocker, Singularity, etc.
+# Don't forget you will need to customize them for your particular use case.
+backend {
+  # Override the default backend.
+  default = "LocalExample"
+
+  # The list of providers.
+  providers {
+    # Copy-paste the contents of a backend provider in this section
+    # Examples in cromwell.example.backends include:
+    # LocalExample: What you should use if you want to define a new backend provider
+    # AWS: Amazon Web Services
+    # TES: protocol defined by GA4GH
+    # TESK: the same, with kubernetes support
+    # Google Pipelines, v2 (PAPIv2)
+    # Docker
+    # Singularity: a container safe for HPC
+    # Singularity+Slurm: and an example on Slurm
+    # udocker: another rootless container solution
+    # udocker+slurm: also exemplified on slurm
+    # HtCondor: workload manager at UW-Madison
+    # LSF: the Platform Load Sharing Facility backend
+    # SGE: Sun Grid Engine
+    # SLURM: workload manager
+
+    # Note that these other backend examples will need tweaking and configuration.
+    # Please open an issue at https://www.github.com/broadinstitute/cromwell if you have any questions
+
+    # The local provider is included by default. This is an example.
+    # Define a new backend provider.
+    LocalExample {
+      # The actor that runs the backend. In this case, it's the Shared File System (SFS) ConfigBackend.
+      actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
+
+      # The backend custom configuration.
+      config {
+
+        # Optional limits on the number of concurrent jobs
+        #concurrent-job-limit = 5
+
+        # If true submits scripts to the bash background using "&". Only useful for dispatchers that do NOT submit
+        # the job and then immediately return a scheduled job id.
+        run-in-background = true
+
+        # `temporary-directory` creates the temporary directory for commands.
+        #
+        # If this value is not set explicitly, the default value creates a unique temporary directory, equivalent to:
+        # temporary-directory = "$(mktemp -d \"$PWD\"/tmp.XXXXXX)"
+        #
+        # The expression is run from the execution directory for the script. The expression must create the directory
+        # if it does not exist, and then return the full path to the directory.
+        #
+        # To create and return a non-random temporary directory, use something like:
+        # temporary-directory = "$(mkdir -p /tmp/mydir && echo /tmp/mydir)"
+
+        # `script-epilogue` configures a shell command to run after the execution of every command block.
+        #
+        # If this value is not set explicitly, the default value is `sync`, equivalent to:
+        # script-epilogue = "sync"
+        #
+        # The default `sync` behavior is turned off here by setting this value to an empty string:
+        script-epilogue = ""
+
+        # `glob-link-command` specifies command used to link glob outputs, by default using hard-links.
+        # If filesystem doesn't allow hard-links (e.g., beeGFS), change to soft-links as follows:
+        # glob-link-command = "ln -sL GLOB_PATTERN GLOB_DIRECTORY"
+
+        # The list of possible runtime custom attributes.
+        runtime-attributes = """
+        String? docker
+        String? docker_user
+        """
+
+        # Submit string when there is no "docker" runtime attribute.
+        submit = "/usr/bin/env bash ${script}"
+
+        # Submit string when there is a "docker" runtime attribute.
+        submit-docker = """
+        docker run \
+          --rm -i \
+          ${"--user " + docker_user} \
+          --entrypoint ${job_shell} \
+          -v ${cwd}:${docker_cwd} \
+          ${docker} ${docker_script}
+        """
+
+        # Root directory where Cromwell writes job results. This directory must be
+        # visible and writeable by the Cromwell process as well as the jobs that Cromwell
+        # launches.
+        root = "cromwell-executions"
+
+        # Root directory where Cromwell writes job results in the container. This value
+        # can be used to specify where the execution folder is mounted in the container.
+        # It is used for the construction of the docker_cwd string in the submit-docker
+        # value above.
+        dockerRoot = "/cromwell-executions"
+
+        # File system configuration.
+        filesystems {
+
+          # For SFS backends, the "local" configuration specifies how files are handled.
+          local {
+
+            # Try to hard link (ln), then soft-link (ln -s), and if both fail, then copy the files.
+            localization: [
+              "hard-link", "soft-link", "copy"
+            ]
+            # An experimental localization strategy called "cached-copy" is also available for SFS backends.
+            # This will copy a file to a cache and then hard-link from the cache. It will copy the file to the cache again
+            # when the maximum number of hardlinks for a file is reached. The maximum number of hardlinks can be set with:
+            # max-hardlinks: 950
+
+            # Call caching strategies
+            caching {
+              # When copying a cached result, what type of file duplication should occur. Attempted in the order listed below:
+              duplication-strategy: [
+                "hard-link", "soft-link", "copy"
+              ]
+
+              # Possible values: file, path, path+modtime
+              # "file" will compute an md5 hash of the file content.
+              # "path" will compute an md5 hash of the file path. This strategy will only be effective if the duplication-strategy (above) is set to "soft-link",
+              # in order to allow for the original file path to be hashed.
+              # "path+modtime" will compute an md5 hash of the file path and the last modified time. The same conditions as for "path" apply here.
+              # Default: file
+              hashing-strategy: "file"
+
+              # When true, will check if a sibling file with the same name and the .md5 extension exists, and if it does, use the content of this file as a hash.
+              # If false or the md5 does not exist, will proceed with the above-defined hashing strategy.
+              check-sibling-md5: false
+            }
+          }
+        }
+
+        # The defaults for runtime attributes if not provided.
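+        # As a sketch of the semantics of the two values below: a task that writes to stderr
+        # still succeeds, and any exit code other than 0 fails the task, unless the task's own
+        # runtime section overrides these defaults.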
+        default-runtime-attributes {
+          failOnStderr: false
+          continueOnReturnCode: 0
+        }
+      }
+    }
+  }
+}
+
+services {
+  MetadataService {
+
+    # This class is the "default" database backed implementation:
+    # class = "cromwell.services.metadata.impl.MetadataServiceActor"
+    # config {
+    #   # For the standard MetadataService implementation, cromwell.services.metadata.impl.MetadataServiceActor:
+    #   # Set this value to "Inf" to turn off metadata summary refresh. The default value is currently "1 second".
+    #   metadata-summary-refresh-interval = "1 second"
+    #
+    #   # Set this value to the maximum number of metadata rows to be considered per summarization cycle.
+    #   metadata-summary-refresh-limit = 5000
+    #
+    #   # For higher scale environments, e.g. many workflows and/or jobs, DB write performance for metadata events
+    #   # can be improved by writing to the database in batches. Increasing this value can dramatically improve overall
+    #   # performance but will both lead to a higher memory usage as well as increase the risk that metadata events
+    #   # might not have been persisted in the event of a Cromwell crash.
+    #   #
+    #   # For normal usage the default value of 200 should be fine but for larger/production environments we recommend a
+    #   # value of at least 500. There'll be no one-size-fits-all number here so we recommend benchmarking performance and
+    #   # tuning the value to match your environment.
+    #   db-batch-size = 200
+    #
+    #   # Periodically the stored metadata events will be forcibly written to the DB regardless of whether the batch size
+    #   # has been reached. This is to prevent situations where events wind up never being written to an incomplete batch
+    #   # with no new events being generated. The default value is currently 5 seconds
+    #   db-flush-rate = 5 seconds
+    #
+    #   # Kill metadata SQL queries that run so long that the associated request will likely already have timed out.
+    #   # The intention is to return resources to the system within a reasonable timeframe to avoid OOM incidents.
+    #   # See also `akka.http.server.request-timeout`.
+    #   metadata-read-query-timeout = "Inf"
+    #
+    #   # Limit the number of rows from METADATA_ENTRY that will be fetched to produce metadata responses.
+    #   # This limit takes into account the effects of `includeKey`, `excludeKey` and `includeSubworkflows`
+    #   # request parameters; only the rows required to be retrieved from the database to compose the response
+    #   # count against this limit.
+    #   metadata-read-row-number-safety-threshold = 1000000
+    #
+    #   metadata-write-statistics {
+    #     # Not strictly necessary since the 'metadata-write-statistics' section itself is enough for statistics to be recorded.
+    #     # However, this can be set to 'false' to disable statistics collection without deleting the section.
+    #     enabled: true
+    #
+    #     # How many workflows to maintain statistics for concurrently. At ~4x "max-concurrent-workflows", this would be
+    #     # *relatively* resilient to large scatters of subworkflows without risking an uncapped expansion in memory usage.
+    #     # Note that cache entries expire after 4h of not being accessed, regardless of whether this limit is reached or not.
+    #     cache-size = 20000
+    #
+    #     # How many metadata rows to alert at each increment of. At 100k, there will be alert messages every time a
+    #     # workflow publishes another 100k rows of metadata.
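+    #     # (Worked example: a workflow that ultimately writes 350k rows would alert as it
+    #     # crosses 100k, 200k and 300k.)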
+    #     metadata-row-alert-interval = 100000
+    #
+    #     # Whether to include subworkflow rows in both individual counts and also include them in their parent counts
+    #     # (and their parent's parent, and so on up to the root)
+    #     sub-workflow-bundling = true
+    #   }
+    # }
+
+    # Alternative 1: Pub sub implementation:
+    # class = "cromwell.services.metadata.impl.MetadataServiceActor"
+    # config {
+    #   # For the Google PubSub MetadataService implementation: cromwell.services.metadata.impl.pubsub.PubSubMetadataServiceActor:
+    #   # Google project
+    #   project = "my-project"
+    #   # The auth *must* be a service-account auth with JSON auth.
+    #   auth = "service-account"
+    #   # The PubSub topic to write to. Will be created if it doesn't already exist. Defaults to "cromwell-metadata"
+    #   topic = "cromwell-metadata"
+    #   # An optional PubSub subscription name. If supplied and if it doesn't already exist, it will be created and
+    #   # subscribed to the topic
+    #   # subscription = "optional-subscription"
+    #   # An application name to set on your PubSub interaction.
+    #   appName = "cromwell"
+    # }
+  }
+
+  Instrumentation {
+    # StatsD - Send metrics to a StatsD server over UDP
+    # class = "cromwell.services.instrumentation.impl.statsd.StatsDInstrumentationServiceActor"
+    # config {
+    #   hostname = "localhost"
+    #   port = 8125
+    #   prefix = "" # can be used to prefix all metrics with an api key for example
+    #   flush-rate = 1 second # rate at which aggregated metrics will be sent to statsd
+    # }
+
+    # Stackdriver - Send metrics to Google's monitoring API
+    # class = "cromwell.services.instrumentation.impl.stackdriver.StackdriverInstrumentationServiceActor"
+    # config {
+    #   # auth scheme can be `application_default` or `service_account`
+    #   auth = "service-account"
+    #   google-project = "my-project"
+    #   # rate at which aggregated metrics will be sent to the Stackdriver API, must be 1 minute or more.
+    #   flush-rate = 1 minute
+    #   # the 3 keys below are attached as labels to each metric. `cromwell-perf-test-case` is specifically meant for the perf env.
+    #   cromwell-instance-identifier = "cromwell-101"
+    #   cromwell-instance-role = "role"
+    #   cromwell-perf-test-case = "perf-test-1"
+    # }
+  }
+  HealthMonitor {
+    config {
+
+      #####
+      # Choose what to monitor:
+      #####
+
+      # If you want to check the availability of the PAPI or PAPIv2 services, list them here.
+      # If provided, all values here *MUST* be valid PAPI or PAPIv2 backend names in the Backends stanza.
+      # NB: requires 'google-auth-name' to be set
+      # check-papi-backends: [ PAPIv2 ]
+
+      # If you want to check connection to GCS (NB: requires 'google-auth-name' and 'gcs-bucket-to-check' to be set):
+      # check-gcs: true
+
+      # If you want to check database connectivity:
+      # check-engine-database: true
+
+      # If you want to check dockerhub availability:
+      # check-dockerhub: true
+
+      #####
+      # General health monitor configuration:
+      #####
+
+      # How long to wait between status check sweeps
+      # check-refresh-time = 5 minutes
+
+      # For any given status check, how long to wait before assuming failure
+      # check-timeout = 1 minute
+
+      # For any given status datum, the maximum time a value will be kept before reverting back to "Unknown"
+      # status-ttl = 15 minutes
+
+      # For any given status check, how many times to retry a failure before setting status to failed. Note this
+      # is the number of retries before declaring failure, not the total number of tries, which is 1 more than
+      # the number of retries.
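+      # (Worked example: with 3 retries a single check may therefore run up to 4 times in total.)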
+      # check-failure-retry-count = 3
+
+      # For any given status check, how long to wait between failure retries.
+      # check-failure-retry-interval = 30 seconds
+
+      #####
+      # GCS- and PAPI-specific configuration options:
+      #####
+
+      # The name of an authentication scheme to use for e.g. pinging PAPI and GCS. This should be either an application
+      # default or service account auth, otherwise things won't work as there'll not be a refresh token where you need
+      # it.
+      # google-auth-name = application-default
+
+      # A *bucket* in GCS to periodically stat to check for connectivity. This must be accessible by the auth mode
+      # specified by google-auth-name
+      # NB: This is a *bucket name*, not a URL and not an *object*. With 'some-bucket-name', Cromwell would ping gs://some-bucket-name
+      # gcs-bucket-to-check = some-bucket-name
+    }
+  }
+  LoadController {
+    config {
+      # The load controller service will periodically look at the status of the various metrics it is collecting and make an
+      # assessment of the system's load. If necessary an alert will be sent to the rest of the system.
+      # This option sets how frequently this should happen
+      # To disable load control, set this to "Inf"
+      # control-frequency = 5 seconds
+    }
+  }
+}
+
+database {
+  # mysql example
+  #driver = "slick.driver.MySQLDriver$"
+
+  # see all possible parameters and default values here:
+  # http://slick.lightbend.com/doc/3.2.0/api/index.html#slick.jdbc.JdbcBackend$DatabaseFactoryDef@forConfig(String,Config,Driver):Database
+  #db {
+  #  driver = "com.mysql.jdbc.Driver"
+  #  url = "jdbc:mysql://host/cromwell?rewriteBatchedStatements=true"
+  #  user = "user"
+  #  password = "pass"
+  #  connectionTimeout = 5000
+  #}
+
+  # For batch inserts the number of inserts to send to the DB at a time
+  # insert-batch-size = 2000
+
+  migration {
+    # For databases with a very large number of symbols, selecting all the rows at once can generate a variety of
+    # problems. In order to avoid any issue, the selection is paginated. This value sets how many rows should be
+    # retrieved and processed at a time, before asking for the next chunk.
+    #read-batch-size = 100000
+
+    # Because a symbol row can contain any arbitrary wdl value, the amount of metadata rows to insert from a single
+    # symbol row can vary from 1 to several thousands (or more). To keep the size of the insert batch from growing out
+    # of control we monitor its size and execute/commit when it reaches or exceeds writeBatchSize.
+    #write-batch-size = 100000
+  }
+
+  # To customize the metadata database connection, create a block under `database` with the metadata database settings.
+  #
+  # For example, the default database stores all data in memory. This commented-out block would store `metadata` in an
+  # hsqldb file, without modifying the internal engine database connection.
+  #
+  # The value `${uniqueSchema}` is always replaced with a unique UUID on each cromwell startup.
+  #
+  # This feature should be considered experimental and likely to change in the future.
+
+  #metadata {
+  #  profile = "slick.jdbc.HsqldbProfile$"
+  #  db {
+  #    driver = "org.hsqldb.jdbcDriver"
+  #    url = "jdbc:hsqldb:file:metadata-${uniqueSchema};shutdown=false;hsqldb.tx=mvcc"
+  #    connectionTimeout = 3000
+  #  }
+  #}
+
+  # Postgresql example
+  #database {
+  #  profile = "slick.jdbc.PostgresProfile$"
+  #  db {
+  #    driver = "org.postgresql.Driver"
+  #    url = "jdbc:postgresql://localhost:5432/cromwell"
+  #    user = ""
+  #    password = ""
+  #    port = 5432
+  #    connectionTimeout = 5000
+  #  }
+  #}
+}
\ No newline at end of file
diff --git a/codes/run_wdl.py b/codes/run_wdl.py
index 6289f4a..214f6a5 100755
--- a/codes/run_wdl.py
+++ b/codes/run_wdl.py
@@ -94,7 +94,9 @@ def run(barcode, normal, umi, input_dir, output_dir, project, cancer, probe, wdl
     # f'{"-Dcall-caching.enabled=false " if uncache else ""}'
     # f'-Dconfig.file=/home/zhangchao/project/pipeline/workflow/script/cromwell.examples.conf ' \
-    cmd4 = f'/usr/bin/java -jar $WORKFLOW/software/cromwell-51.jar run --inputs {jsfile_path} {wdl}'
+    cmd4 = f'/home/install/product/workflow/software/jdk-17.0.7+7/bin/java ' \
+           f'-Dconfig.file=$WORKFLOW/codes/cromwell.examples.conf ' \
+           f'-jar $WORKFLOW/software/cromwell-86.jar run {wdl} --inputs {jsfile_path} --metadata-output run.log'
 
     # cmd = f'{cmd1}; {cmd2}; {cmd3}; {cmd4}'
     cmd = f'{cmd3}; {cmd4}'
diff --git a/pipeline.wdl b/pipeline.wdl
index 7f92fdc..b341c7d 100644
--- a/pipeline.wdl
+++ b/pipeline.wdl
@@ -1,17 +1,18 @@
-import "./wdl/qc.wdl"
-import "./wdl/alignment.wdl"
-import "./wdl/call_mutation.wdl"
-import "./wdl/fusion.wdl"
-import "./wdl/statistics.wdl"
-import "./wdl/cnv.wdl"
-import "./wdl/msi.wdl"
-import "./wdl/chemo.wdl"
-import "./wdl/hereditary.wdl"
-import "./wdl/pollution.wdl"
-import "./wdl/tmb.wdl"
-import "./wdl/postprocess.wdl"
-import "./wdl/neoantigen.wdl"
+import "wdl/qc.wdl"
+import "wdl/alignment.wdl"
+import "wdl/call_mutation.wdl"
+import "wdl/fusion.wdl"
+import "wdl/statistics.wdl"
+import "wdl/cnv.wdl"
+import "wdl/msi.wdl"
+import "wdl/chemo.wdl"
+import "wdl/hereditary.wdl"
+import "wdl/pollution.wdl"
+import "wdl/tmb.wdl"
+import "wdl/postprocess.wdl"
+import "wdl/neoantigen.wdl"
+
 
 workflow pipeline {
 
diff --git a/wdl/chemo.wdl b/wdl/chemo.wdl
index d5598b9..bba4d73 100755
--- a/wdl/chemo.wdl
+++ b/wdl/chemo.wdl
@@ -13,6 +13,7 @@ task run_chemo {
         fi
 
         chemo.py -d $DATABASE/chemo_database.xlsx -probe ${probe} -n ${name} -v ${vcf} -o ${output_dir}/chemo -c ${cancer} -p ${project}
+
     >>>
 
     output {
diff --git a/wdl/hereditary.wdl b/wdl/hereditary.wdl
index fdc0926..596f7df 100755
--- a/wdl/hereditary.wdl
+++ b/wdl/hereditary.wdl
@@ -11,6 +11,7 @@ task run_hereditary {
         fi
 
         hereditary.py -d $DATABASE/hereditary_database.xlsx -p ${project} -n ${name} -f ${filter_txt} -o ${output_dir}/hereditary
+
     >>>
 
     output {
diff --git a/wdl/msi.wdl b/wdl/msi.wdl
index 3b31417..53a76d7 100755
--- a/wdl/msi.wdl
+++ b/wdl/msi.wdl
@@ -42,6 +42,7 @@ task msi_paired {
             -e ${bed} \
             -b 10 \
             -o ${output_dir}/msi/${name}.msi.txt
+
    >>>
 
    output {
diff --git a/wdl/neoantigen.wdl b/wdl/neoantigen.wdl
index d8f01ef..a283fe0 100755
--- a/wdl/neoantigen.wdl
+++ b/wdl/neoantigen.wdl
@@ -147,6 +147,7 @@ task run_neoantigen {
         dos2unix ${output_dir}/neoantigen/MHC_Class_I/*.all_epitopes.tsv
 
         netchop.pl ${output_dir} ${tumor}
+
    >>>
 }
 
diff --git a/wdl/pollution.wdl b/wdl/pollution.wdl
index a52bd8f..737cf8f 100755
--- a/wdl/pollution.wdl
+++ b/wdl/pollution.wdl
@@ -18,6 +18,7 @@ task run_pollution {
             -p ${probe} \
             -b $PUBLIC/pollution/${probe}_contaminate_ref.bed \
             -c $PUBLIC/pollution/${probe}_contaminate_cnvkit.bed
+
     >>>
 
     output {
@@ -39,7 +40,7 @@ workflow call_pollution {
 
     if (run) {
         if (defined(normal)) {
-            call run_pollution as run_pollution_paired {
+            call run_pollution as run_pollution_paired {
                 input:
                     name=tumor,
                     output_dir=output_dir,
@@ -50,7 +51,7 @@ workflow call_pollution {
         }
 
         if (!defined(normal)) {
-            call run_pollution as run_pollution_single {
+            call run_pollution as run_pollution_single {
                 input:
                     name=tumor,
                     output_dir=output_dir,
diff --git a/wdl/postprocess.wdl b/wdl/postprocess.wdl
index c19816e..0b43dfc 100755
--- a/wdl/postprocess.wdl
+++ b/wdl/postprocess.wdl
@@ -13,6 +13,7 @@ task run_post {
     String output_dir
     String cancer
     String project
+
     command <<<
 
         if [ ! -d ${output_dir}/report ];then
@@ -21,6 +22,7 @@ task run_post {
         indication.pl ${output_dir} ${cancer} ${project}
         sample_post.py -s ${name} -o ${output_dir}
         postprocess.py -n ${name} -s ${normal} -c ${output_dir} -o ${output_dir}/report/${name}.merged_file.xlsx
+
     >>>
 
     output {
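
For reviewers who want to exercise the patched launch command outside of run_wdl.py, below is a
minimal Python sketch of the invocation that the new cmd4 assembles. The jdk and cromwell paths
are taken from the diff; the $WORKFLOW fallback, the WDL file and the inputs file names are
assumptions for illustration only.

    #!/usr/bin/env python3
    """Sketch: rebuild and run the Cromwell command that the patched cmd4 assembles."""
    import os
    import subprocess

    workflow_root = os.environ.get("WORKFLOW", "/home/install/product/workflow")  # assumed root
    java = "/home/install/product/workflow/software/jdk-17.0.7+7/bin/java"        # path from the diff
    wdl = "pipeline.wdl"            # hypothetical example
    inputs = "sample.inputs.json"   # hypothetical example

    cmd = [
        java,
        f"-Dconfig.file={workflow_root}/codes/cromwell.examples.conf",  # pick up the config above
        "-jar", f"{workflow_root}/software/cromwell-86.jar",
        "run", wdl,
        "--inputs", inputs,
        "--metadata-output", "run.log",  # per-run metadata JSON, as in the patched cmd4
    ]
    subprocess.run(cmd, check=True)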