diff --git a/.github/workflows/autoassig_milestone_to_issue.yml b/.github/workflows/assign-milestone-issue.yml similarity index 100% rename from .github/workflows/autoassig_milestone_to_issue.yml rename to .github/workflows/assign-milestone-issue.yml diff --git a/.github/workflows/auto-format.yml b/.github/workflows/auto-format.yml new file mode 100644 index 0000000..c7fa45f --- /dev/null +++ b/.github/workflows/auto-format.yml @@ -0,0 +1,37 @@ +name: auto-format + +on: + workflow_dispatch: + pull_request: + +env: + GH_TOKEN: ${{ github.token }} + +permissions: + contents: write + pull-requests: write + +jobs: + auto-format: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + if: github.event_name == 'pull_request' + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.ref }} + - uses: actions/checkout@v4 + if: github.event_name != 'pull_request' + with: + fetch-depth: 0 + + - name: format + uses: pre-commit/action@v3.0.1 + continue-on-error: true + - name: commit & push + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add . + git commit -m "ci: ๐Ÿค– format everything with pre-commit" && git push || echo "nothing to commit" diff --git a/.github/workflows/auto_add_reponame_labels.yml b/.github/workflows/auto_add_reponame_labels.yml deleted file mode 100644 index 658d105..0000000 --- a/.github/workflows/auto_add_reponame_labels.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: Auto add reponame as label to all issues and PRs - -on: - issues: - types: - - opened - pull_request: - types: - - opened -permissions: - issues: write - pull-requests: write -jobs: - add_label: - uses: CCBR/.github/.github/workflows/add_reponame_issue_label.yml@v0.2.0 - secrets: inherit diff --git a/.github/workflows/draft-release.yml b/.github/workflows/draft-release.yml index 0bbe6bb..3036566 100644 --- a/.github/workflows/draft-release.yml +++ b/.github/workflows/draft-release.yml @@ -14,6 +14,7 @@ on: permissions: contents: write + pull-requests: write actions: write jobs: @@ -23,7 +24,7 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 # required to include tags - - uses: CCBR/actions/draft-release@main + - uses: CCBR/actions/draft-release@v0.2 with: github-token: ${{ github.token }} version-tag: ${{ github.event.inputs.version-tag }} diff --git a/.github/workflows/label-issues-repo-name.yml b/.github/workflows/label-issues-repo-name.yml new file mode 100644 index 0000000..d946f78 --- /dev/null +++ b/.github/workflows/label-issues-repo-name.yml @@ -0,0 +1,21 @@ +name: label-issues-repo-name + +on: + issues: + types: + - opened + pull_request: + types: + - opened + +permissions: + issues: write + pull-requests: write + +jobs: + add-label: + runs-on: ubuntu-latest + steps: + - uses: CCBR/actions/label-issue-repo-name@main + with: + github-token: ${{ github.token }} diff --git a/.github/workflows/post-release.yml b/.github/workflows/post-release.yml index 209014a..0726a7b 100644 --- a/.github/workflows/post-release.yml +++ b/.github/workflows/post-release.yml @@ -8,7 +8,7 @@ on: permissions: contents: write pull-requests: write - issues: write + actions: write jobs: cleanup: @@ -17,6 +17,6 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 - - uses: CCBR/actions/post-release@main + - uses: CCBR/actions/post-release@v0.2 with: github-token: ${{ github.token }} diff --git a/.test/README.md b/.test/README.md index d9503a7..3891de8 100644 --- a/.test/README.md +++ b/.test/README.md @@ -1,7 +1,8 @@ ## Test data + 2 mouse samples: -* iCre_D0 -* D4_Meso_iCre_Dox +- iCre_D0 +- D4_Meso_iCre_Dox -Each sample has 2 replicates. Each replicate has been subsampled to only include reads aligning to chr19:10,000,000-20,000,000 \ No newline at end of file +Each sample has 2 replicates. Each replicate has been subsampled to only include reads aligning to chr19:10,000,000-20,000,000 diff --git a/README.md b/README.md index ab73a15..2f0f2f4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ASPEN -**A**tac **S**eq **P**ip**E**li**N**e : +**A**tac **S**eq **P**ip**E**li**N**e : [CCBR](https://bioinformatics.ccr.cancer.gov/ccbr/) recommends ASPEN to effectively analyze ATAC-seq datasets on the [BIOWULF](https://hpc.nih.gov) HPC system at the [NIH](https://www.nih.gov/). @@ -12,6 +12,7 @@ ```bash module load ccbrpipeliner/7 ``` + ```bash aspen --help ``` diff --git a/config/README.md b/config/README.md index 4ac3851..e3d2055 100644 --- a/config/README.md +++ b/config/README.md @@ -1,5 +1,6 @@ ## Config files -* `config.yaml`: sets all the runtime configurations for the pipeline like output folder, resources folder, genome, tool-parameters etc. -* `samples.tsv`: tab delimited sample manifest. Defaults to the samples in the .test folder of the pipeline -* `multiqc_atacseq_config.yaml`: custom-multiqc config file for report generation -* `create_test_config_sample_manifest_files.bash`: replicates variables `PIPELINE_HOME` and `WORKDIR` etc. to create sample manifest for testing purposes + +- `config.yaml`: sets all the runtime configurations for the pipeline like output folder, resources folder, genome, tool-parameters etc. +- `samples.tsv`: tab delimited sample manifest. Defaults to the samples in the .test folder of the pipeline +- `multiqc_atacseq_config.yaml`: custom-multiqc config file for report generation +- `create_test_config_sample_manifest_files.bash`: replicates variables `PIPELINE_HOME` and `WORKDIR` etc. to create sample manifest for testing purposes diff --git a/config/multiqc_atacseq_config.yaml b/config/multiqc_atacseq_config.yaml index 8d89b1d..cffd405 100755 --- a/config/multiqc_atacseq_config.yaml +++ b/config/multiqc_atacseq_config.yaml @@ -1,115 +1,115 @@ custom_data: - Nreads: - section_name: 'Nreads' - description: 'Number of reads per replicate. Mitochondrial fraction should be < 10%.' - nrf_stats: - file_format: 'tsv' - section_name: 'NRF PBC Stats' - description: ' - Non-redundant Fraction (NRF): Number of distinct uniquely mapping reads (i.e. after removing duplicates) / Total number of reads. - PCR Bottlenecking Coefficient 1 (PBC1): (number of genomic locations where exactly one read maps uniquely) / (number of distinct genomic locations to which some read maps uniquely). - PCR Bottlenecking Coefficient 2 (PBC2): (number of genomic locations where only one read maps uniquely) / (number of genomic locations where two reads map uniquely). - The preferred values are as follows: NRF>0.9, PBC1>0.9, and PBC2>3. ' - plot_type: 'table' - pconfig: - id: 'NRF Stats' - title: 'NRF Stats table' - fld_files: - file_format: 'tsv' - section_name: 'Fragment Length Distribution' - description: 'Per sample FLD. Peak < 150bp = Nucleosome Free Peak, Peak between 150-300 bp = Mononucleosome Peak, Peak > 300bp = Dinucleosome Peak' - plot_type: 'linegraph' - pconfig: - id: 'FLD' - title: 'Fragment Length Distribution' - ylab: 'Normalized read density X 1e3' - xlab: 'Fragment Length' - xmax: 1000 - fld_stats_peaks: - file_format: 'tsv' - section_name: 'FLD Stats (Peaks)' - description: 'Presence or Absense of nucleosome-free, mono and di-nucleosome peaks in FLD. A nucleosome free region (NFR) must be present. A mononucleosome peak must be present.' - plot_type: 'table' - pconfig: - id: 'FLD Stats' - title: 'FLD Stats table' - fld_stats_details: - file_format: 'tsv' - section_name: 'FLD Stats (Fractions and Ratios)' - description: 'Fractions and Ratios of interest' - plot_type: 'table' - pconfig: - id: 'FLD Stats' - title: 'FLD Stats table' - MACS2_Peak_Annotations: - section_name: 'MACS2 Peaks' - description: 'Peaks called using MACS2. For human or mouse data, Npeaks should be >150,000, though values >100,000 may be acceptable. ' - Genrich_Peak_Annotations: - section_name: 'Genrich Peaks' - description: 'Peaks called using Genrich. For human or mouse data, Npeaks should be >150,000, though values >100,000 may be acceptable. ' - peak_width_files: - file_format: 'tsv' - section_name: 'Peak width distribution' - description: 'Peak width distribution of consensus peaks.' - plot_type: 'linegraph' - pconfig: - id: 'PWD' - title: 'Peak Width Distribution' - ylab: 'Peak Density Percentage' - xlab: 'Peak Width' - xmax: 20000 - frip_stats: - file_format: 'tsv' - section_name: 'FRiP Stats' - description: 'The fraction of reads in called peak regions (FRiP score) should be >0.3, though values greater than 0.2 are acceptable. Fraction of Reads in Peaks/DHS/Enhancers/Promoters are also reported. For human or mouse data, FRiPromoters is 12-20%' - plot_type: 'table' - pconfig: - id: 'FRiP Stats' - title: 'FRiP Stats table' - tss_files: - file_format: 'tsv' - section_name: 'TSS distribution' - description: 'Greenleaf Normalized TSS per sample distribution' - plot_type: 'linegraph' - pconfig: - id: 'TSS Enrichment' - title: 'TSS Enrichment Distribution' - ylab: 'Greenleaf Normalized TSS Enrichment' - xlab: 'Distance from TSS' - tss_knicking_sites_files: - file_format: 'tsv' - section_name: 'TSS Score Scatter' - description: 'TSS score to TSS with >20 Tn5knicking sites scatter. ' - plot_type: 'scatter' - pconfig: - id: 'TSS_scatter' - title: 'TSS_Score_Scatter' - ylab: 'TSS score' - xlab: 'Number of TSS sites with > 20 Tn5 knick sites' + Nreads: + section_name: "Nreads" + description: "Number of reads per replicate. Mitochondrial fraction should be < 10%." + nrf_stats: + file_format: "tsv" + section_name: "NRF PBC Stats" + description: " + Non-redundant Fraction (NRF): Number of distinct uniquely mapping reads (i.e. after removing duplicates) / Total number of reads. + PCR Bottlenecking Coefficient 1 (PBC1): (number of genomic locations where exactly one read maps uniquely) / (number of distinct genomic locations to which some read maps uniquely). + PCR Bottlenecking Coefficient 2 (PBC2): (number of genomic locations where only one read maps uniquely) / (number of genomic locations where two reads map uniquely). + The preferred values are as follows: NRF>0.9, PBC1>0.9, and PBC2>3. " + plot_type: "table" + pconfig: + id: "NRF Stats" + title: "NRF Stats table" + fld_files: + file_format: "tsv" + section_name: "Fragment Length Distribution" + description: "Per sample FLD. Peak < 150bp = Nucleosome Free Peak, Peak between 150-300 bp = Mononucleosome Peak, Peak > 300bp = Dinucleosome Peak" + plot_type: "linegraph" + pconfig: + id: "FLD" + title: "Fragment Length Distribution" + ylab: "Normalized read density X 1e3" + xlab: "Fragment Length" + xmax: 1000 + fld_stats_peaks: + file_format: "tsv" + section_name: "FLD Stats (Peaks)" + description: "Presence or Absense of nucleosome-free, mono and di-nucleosome peaks in FLD. A nucleosome free region (NFR) must be present. A mononucleosome peak must be present." + plot_type: "table" + pconfig: + id: "FLD Stats" + title: "FLD Stats table" + fld_stats_details: + file_format: "tsv" + section_name: "FLD Stats (Fractions and Ratios)" + description: "Fractions and Ratios of interest" + plot_type: "table" + pconfig: + id: "FLD Stats" + title: "FLD Stats table" + MACS2_Peak_Annotations: + section_name: "MACS2 Peaks" + description: "Peaks called using MACS2. For human or mouse data, Npeaks should be >150,000, though values >100,000 may be acceptable. " + Genrich_Peak_Annotations: + section_name: "Genrich Peaks" + description: "Peaks called using Genrich. For human or mouse data, Npeaks should be >150,000, though values >100,000 may be acceptable. " + peak_width_files: + file_format: "tsv" + section_name: "Peak width distribution" + description: "Peak width distribution of consensus peaks." + plot_type: "linegraph" + pconfig: + id: "PWD" + title: "Peak Width Distribution" + ylab: "Peak Density Percentage" + xlab: "Peak Width" + xmax: 20000 + frip_stats: + file_format: "tsv" + section_name: "FRiP Stats" + description: "The fraction of reads in called peak regions (FRiP score) should be >0.3, though values greater than 0.2 are acceptable. Fraction of Reads in Peaks/DHS/Enhancers/Promoters are also reported. For human or mouse data, FRiPromoters is 12-20%" + plot_type: "table" + pconfig: + id: "FRiP Stats" + title: "FRiP Stats table" + tss_files: + file_format: "tsv" + section_name: "TSS distribution" + description: "Greenleaf Normalized TSS per sample distribution" + plot_type: "linegraph" + pconfig: + id: "TSS Enrichment" + title: "TSS Enrichment Distribution" + ylab: "Greenleaf Normalized TSS Enrichment" + xlab: "Distance from TSS" + tss_knicking_sites_files: + file_format: "tsv" + section_name: "TSS Score Scatter" + description: "TSS score to TSS with >20 Tn5knicking sites scatter. " + plot_type: "scatter" + pconfig: + id: "TSS_scatter" + title: "TSS_Score_Scatter" + ylab: "TSS score" + xlab: "Number of TSS sites with > 20 Tn5 knick sites" sp: - nrf_stats: - fn: 'NRF_stats.tsv' - frip_stats: - fn: 'FRiP_stats.tsv' - fld_stats_peaks: - fn: 'FLD_stats_peaks.tsv' - fld_stats_details: - fn: 'FLD_stats_fractions_ratios.tsv' - tss_knicking_sites_files: - fn: 'data.tss_nicking_sites.txt' - peak_width_files: - fn: '*.annotated.peak_width_density' - fld_files: - fn: '*.fld.txt' - tss_files: - fn: '*.tss.txt' + nrf_stats: + fn: "NRF_stats.tsv" + frip_stats: + fn: "FRiP_stats.tsv" + fld_stats_peaks: + fn: "FLD_stats_peaks.tsv" + fld_stats_details: + fn: "FLD_stats_fractions_ratios.tsv" + tss_knicking_sites_files: + fn: "data.tss_nicking_sites.txt" + peak_width_files: + fn: "*.annotated.peak_width_density" + fld_files: + fn: "*.fld.txt" + tss_files: + fn: "*.tss.txt" fn_clean_exts: - - '.genrich.narrowPeak.annotation_distribution' - - '.consensus.bed.annotated.peak_width_density' - - '.filt.bam' - - '.preseq' - - '.tss.txt' - - '.fld.txt' - - '.bowtie2.log' - - '.fastq.gz' + - ".genrich.narrowPeak.annotation_distribution" + - ".consensus.bed.annotated.peak_width_density" + - ".filt.bam" + - ".preseq" + - ".tss.txt" + - ".fld.txt" + - ".bowtie2.log" + - ".fastq.gz" diff --git a/docs/css/custom.css b/docs/css/custom.css index 5fcc4ec..c4c8782 100644 --- a/docs/css/custom.css +++ b/docs/css/custom.css @@ -21,7 +21,7 @@ pre { /* Light blue background for notes */ div.admonition.note { background-color: #e3f2fd; - border-left: 5px solid #2196F3; + border-left: 5px solid #2196f3; padding: 10px; margin: 10px 0; border-radius: 5px; @@ -30,7 +30,7 @@ div.admonition.note { /* Light grey background for info */ div.admonition.info { background-color: #f5f5f5; - border-left: 5px solid #9E9E9E; + border-left: 5px solid #9e9e9e; padding: 10px; margin: 10px 0; border-radius: 5px; @@ -39,9 +39,8 @@ div.admonition.info { /* Orange warning box */ div.admonition.warning { background-color: #fff3e0; - border-left: 5px solid #FF9800; + border-left: 5px solid #ff9800; padding: 10px; margin: 10px 0; border-radius: 5px; } - diff --git a/docs/deployment.md b/docs/deployment.md index 339501e..e4967c7 100644 --- a/docs/deployment.md +++ b/docs/deployment.md @@ -2,7 +2,6 @@ To effectively run the ASPEN (ATAC-Seq PipEliNe) on the Biowulf High-Performance Computing (HPC) system, please follow the detailed user guide below: - ## ๐Ÿ› ๏ธ Prerequisites - **Biowulf Account:** Ensure you have an active Biowulf account. @@ -22,7 +21,8 @@ This command adds aspen to your system's PATH, allowing you to execute pipeline > **Note**: If you're operating outside of Biowulf, ensure that dependencies such as snakemake, python, and singularity are installed and accessible in your system's PATH. -### ๐Ÿ“ Create a Sample Manifest +### ๐Ÿ“ Create a Sample Manifest + ASPEN requires a sample manifest file (`samples.tsv`) to identify and organize your input data. This tab-separated file should include the following columns: - `replicateName`: Unique identifier for each replicate. @@ -31,18 +31,20 @@ ASPEN requires a sample manifest file (`samples.tsv`) to identify and organize y - `path_to_R2_fastq`: Absolute path to the Read 2 FASTQ file (required for paired-end data). !!! note - Symlinks for R1 and R2 files will be created in the results directory, named as .R1.fastq.gz and .R2.fastq.gz, respectively. Therefore, original filenames do not need to be altered. +Symlinks for R1 and R2 files will be created in the results directory, named as .R1.fastq.gz and .R2.fastq.gz, respectively. Therefore, original filenames do not need to be altered. !!! note - The `replicateName` is used as a prefix for individual peak calls, while the `sampleName` serves as a prefix for consensus peak calls. +The `replicateName` is used as a prefix for individual peak calls, while the `sampleName` serves as a prefix for consensus peak calls. !!! note - For differential ATAC analysis, create a `contrasts.tsv` file with two columns (Group1 and Group2 ... aka Sample1 and Sample2, without headers) and place it in the output directory after initialization. Ensure each group/sample in the contrast has at least two replicates, as DESeq2 requires this for accurate contrast calculations. +For differential ATAC analysis, create a `contrasts.tsv` file with two columns (Group1 and Group2 ... aka Sample1 and Sample2, without headers) and place it in the output directory after initialization. Ensure each group/sample in the contrast has at least two replicates, as DESeq2 requires this for accurate contrast calculations. ## ๐Ÿƒ Running the ASPEN Pipeline + ASPEN operates through a series of modes to facilitate various stages of the analysis. ### ๐Ÿ—‚๏ธ Initialize the Working Directory + Begin by initializing your working directory, which will house configuration files and results. Replace with your desired output directory path: ```bash @@ -52,10 +54,11 @@ aspen -m=init -w= This command generates a config.yaml and a placeholder `samples.tsv` in the specified directory. Edit these files to reflect your experimental setup, replacing the placeholder `samples.tsv` with your prepared manifest. If performing differential analysis, include the `contrasts.tsv` file at this stage. !!! note - To explore all possible options of the `aspen` command you can either run it without any arguments or run `aspen --help` +To explore all possible options of the `aspen` command you can either run it without any arguments or run `aspen --help` Here is what help looks like: -```bash + +```bash ########################################################################################## @@ -174,7 +177,7 @@ contrasts_fdr_cutoff: 0.05 ### ๐Ÿงฌ Enabling Spike-In Normalization (Optional) -ASPEN supports spike-in normalization, which is useful for controlling technical variability or comparing global shifts in chromatin accessibility across samples. Spike-in reads (e.g., from *Drosophila melanogaster* or *E. coli*) are aligned separately and used to compute normalization factors that are applied to host genome accessibility counts. +ASPEN supports spike-in normalization, which is useful for controlling technical variability or comparing global shifts in chromatin accessibility across samples. Spike-in reads (e.g., from _Drosophila melanogaster_ or _E. coli_) are aligned separately and used to compute normalization factors that are applied to host genome accessibility counts. To enable spike-in normalization, edit the `config.yaml` file that was generated during `init`. You can find it in your output directory (`/config.yaml`). @@ -194,13 +197,13 @@ To activate spike-in normalization: #### ๐Ÿ“ Example Configuration -For *Drosophila* spike-in: +For _Drosophila_ spike-in:
spikein: True
 spikein_genome: "dmelr6.32"
 
-For *E. coli* spike-in: +For _E. coli_ spike-in:
spikein: True
 spikein_genome: "ecoli_k12"
@@ -217,7 +220,6 @@ Once enabled, ASPEN will:
 
 This step is optional but highly recommended when you expect global changes in chromatin accessibility due to treatments or perturbations.
 
-
 ### ๐Ÿ› ๏ธ Dry Run the Pipeline
 
 Before executing the full analysis and after editing the `config.yaml` as needed, perform a dry run to visualize the workflow and identify potential issues:
@@ -225,9 +227,8 @@ Before executing the full analysis and after editing the `config.yaml` as needed
 ```bash
 aspen -m=dryrun -w=
 ```
-This step outlines the sequence of tasks (Directed Acyclic Graph - DAG) without actual execution, allowing you to verify the planned operations.
-
 
+This step outlines the sequence of tasks (Directed Acyclic Graph - DAG) without actual execution, allowing you to verify the planned operations.
 
 ### ๐Ÿš€ Execute the Pipeline
 
@@ -253,7 +254,6 @@ This command runs the pipeline with the specified working directory and Singular
 
 > **Note**: If deploying on Biowulf, try setting the `--singcache` to `/data/CCBR_Pipeliner/SIFS` to reuse the pre-pulled containers and save time.
 
-
 ## ๐Ÿ“Š Monitor ASPEN Runs
 
 To monitor the status of your ASPEN pipeline and its associated jobs on a Slurm-managed system, you can utilize the squeue and scontrol commands. The squeue command provides information about jobs in the scheduling queue, while scontrol offers detailed insights into specific jobs.
@@ -263,6 +263,7 @@ To view all your active and pending jobs, execute:
 ```bash
 squeue -u $USER --format="%.18i %.30j %.11P %.15T %.10r %.10M %.10l %.5D %.5C %.10m %.25b %.8N" --sort=-S
 ```
+
 This command lists all jobs submitted by your user account, displaying details such as job IDs, partitions, job names, user names, job states, and the nodes allocated.
 
 For more granular information about a specific job, including its child jobs spawned by ASPEN, use the scontrol command:
@@ -270,6 +271,7 @@ For more granular information about a specific job, including its child jobs spa
 ```bash
 scontrol show job 
 ```
+
 Replace  with the specific Job ID of interest. This will provide comprehensive details about the job's configuration and status, aiding in effective monitoring and management of your ASPEN pipeline processes.
 
 To quickly guage the process of the entire pipeline run:
@@ -277,4 +279,3 @@ To quickly guage the process of the entire pipeline run:
 ```bash
 grep "done$" /snakemake.log
 ```
-
diff --git a/docs/extra.md b/docs/extra.md
index 7e31d70..48d3b79 100644
--- a/docs/extra.md
+++ b/docs/extra.md
@@ -2,4 +2,4 @@ Once the ROIs are established, ASPEN generates two distinct count matrices:
 
 - **Tn5 Nicking Sites Count Matrix**: This matrix quantifies the frequency of Tn5 transposase insertion events at each ROI. The `filtered.bam` file (with PCR duplicates and not `dedup.bam`) is used for determining Tn5 sites and counting them. The Tn5 transposase preferentially inserts into accessible regions of the chromatin, and the number of insertion events serves as a proxy for chromatin accessibility. By counting these insertion sites, researchers can accurately infer the openness of chromatin regions under different experimental conditions.
 
-- **Read Counts Matrix**: This matrix records the number of sequencing reads mapped to each ROI. The `filtered.bam` file (with PCR duplicates and not `dedup.bam`) is used for counting. While Tn5 nicking sites provide a more direct measure of chromatin accessibility, read counts are included in ASPEN as they have been widely used in recent publications. Analyzing both matrices together offers a comprehensive view of chromatin accessibility dynamics.
\ No newline at end of file
+- **Read Counts Matrix**: This matrix records the number of sequencing reads mapped to each ROI. The `filtered.bam` file (with PCR duplicates and not `dedup.bam`) is used for counting. While Tn5 nicking sites provide a more direct measure of chromatin accessibility, read counts are included in ASPEN as they have been widely used in recent publications. Analyzing both matrices together offers a comprehensive view of chromatin accessibility dynamics.
diff --git a/docs/js/custom.js b/docs/js/custom.js
index 3b38d0a..4376158 100644
--- a/docs/js/custom.js
+++ b/docs/js/custom.js
@@ -1,19 +1,24 @@
-document.addEventListener("DOMContentLoaded", function() {
-    document.querySelectorAll("pre").forEach(function(pre) {
+document.addEventListener("DOMContentLoaded", function () {
+    document.querySelectorAll("pre").forEach(function (pre) {
         let button = document.createElement("button");
         button.className = "copy-button";
         button.innerText = "Copy";
-        
+
         // Ensure button does not get copied
-        button.addEventListener("click", function(event) {
+        button.addEventListener("click", function (event) {
             event.stopPropagation(); // Prevent event bubbling
             let codeBlock = pre.querySelector("code"); // Select only the code block
             if (codeBlock) {
                 let textToCopy = codeBlock.innerText.trim(); // Trim removes trailing newlines
-                navigator.clipboard.writeText(textToCopy).then(() => {
-                    button.innerText = "Copied!";
-                    setTimeout(() => { button.innerText = "Copy"; }, 1500);
-                }).catch(err => console.error("Copy failed", err));
+                navigator.clipboard
+                    .writeText(textToCopy)
+                    .then(() => {
+                        button.innerText = "Copied!";
+                        setTimeout(() => {
+                            button.innerText = "Copy";
+                        }, 1500);
+                    })
+                    .catch((err) => console.error("Copy failed", err));
             }
         });
 
@@ -31,4 +36,4 @@ document.addEventListener("DOMContentLoaded", function() {
 //   gtag('js', new Date());
 
 //   gtag('config', 'G-D3SL9V30KL');
-// 
\ No newline at end of file
+// 
diff --git a/docs/limitations.md b/docs/limitations.md
index d6d8409..393658f 100644
--- a/docs/limitations.md
+++ b/docs/limitations.md
@@ -8,17 +8,17 @@
 
 - **Genomes supported**: Genomes supported is limited to:
 
-| Genome Assembly | Organism        | Scientific Name       |
-|-----------------|-----------------|-----------------------|
-| hg38            | Human           | *Homo sapiens*        |
-| hg19            | Human           | *Homo sapiens*        |
-| mm10            | Mouse           | *Mus musculus*        |
-| mmul10          | Rhesus Monkey   | *Macaca mulatta*      |
-| bosTau9         | Domestic Cattle | *Bos taurus*          |
+| Genome Assembly | Organism        | Scientific Name  |
+| --------------- | --------------- | ---------------- |
+| hg38            | Human           | _Homo sapiens_   |
+| hg19            | Human           | _Homo sapiens_   |
+| mm10            | Mouse           | _Mus musculus_   |
+| mmul10          | Rhesus Monkey   | _Macaca mulatta_ |
+| bosTau9         | Domestic Cattle | _Bos taurus_     |
 
 - **Spike-in genomes supported**: Spike-in genomes supported is limited to:
 
-| Genome Assembly | Organism        | Scientific Name             |
-|-----------------|-----------------|-----------------------------|
-| dmelr6.32       | Fruit Fly       | *Drosophila melanogaster*   |
-| ecoli_k12       | E. coli         | *Escherichia coli*          |
\ No newline at end of file
+| Genome Assembly | Organism  | Scientific Name           |
+| --------------- | --------- | ------------------------- |
+| dmelr6.32       | Fruit Fly | _Drosophila melanogaster_ |
+| ecoli_k12       | E. coli   | _Escherichia coli_        |
diff --git a/docs/log.md b/docs/log.md
index 81992ef..b6acb57 100644
--- a/docs/log.md
+++ b/docs/log.md
@@ -1,24 +1,29 @@
 ## Creation of test dataset
+
 Using the **D4_Meso_iCre_Dox** and **iCre_D0** samples from the ccbr872 mouse dataset:
 
- * Select 2 replicates for each group .. total samples = 4
- * extract readids from the _chr19:10000000-20000000_ region using:
- ```
+- Select 2 replicates for each group .. total samples = 4
+- extract readids from the _chr19:10000000-20000000_ region using:
+
+```
 % samtools view /data/CCBR/projects/ccbr872/atacseq/test/bam/iCre_D0_1.dedup.bam chr19:10000000-20000000|awk -F"\t" '{print $1}' |sort|uniq > iCre_D0_1.chr19.readids &
 % samtools view /data/CCBR/projects/ccbr872/atacseq/test/bam/iCre_D0_2.dedup.bam chr19:10000000-20000000|awk -F"\t" '{print $1}' |sort|uniq > iCre_D0_2.chr19.readids &
 % samtools view /data/CCBR/projects/ccbr872/atacseq/test/bam/D4_Meso_iCre_Dox_1.dedup.bam chr19:10000000-20000000|awk -F"\t" '{print $1}' |sort|uniq > D4_Meso_iCre_Dox_1.chr19.readids &
 % samtools view /data/CCBR/projects/ccbr872/atacseq/test/bam/D4_Meso_iCre_Dox_2.dedup.bam chr19:10000000-20000000|awk -F"\t" '{print $1}' |sort|uniq > D4_Meso_iCre_Dox_2.chr19.readids &
 ```
- * create subsampled fastq files using these readids:
- ```
+
+- create subsampled fastq files using these readids:
+
+```
 % cd /data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq
 % python /data/kopardevn/GitRepos/Tools/scripts/filter_fastq_by_readids_highmem_pe.py --infq iCre_D0_1.R1.fastq.gz --infq2 iCre_D0_1.R2.fastq.gz --outfq iCre_D0_1.subset.R1.fastq.gz --outfq2 iCre_D0_1.subset.R2.fastq.gz --readids iCre_D0_1.chr19.readids
 % python /data/kopardevn/GitRepos/Tools/scripts/filter_fastq_by_readids_highmem_pe.py --infq iCre_D0_2.R1.fastq.gz --infq2 iCre_D0_2.R2.fastq.gz --outfq iCre_D0_2.subset.R1.fastq.gz --outfq2 iCre_D0_2.subset.R2.fastq.gz --readids iCre_D0_2.chr19.readids
 % python /data/kopardevn/GitRepos/Tools/scripts/filter_fastq_by_readids_highmem_pe.py --infq D4_Meso_iCre_Dox_1.R1.fastq.gz --infq2 D4_Meso_iCre_Dox_1.R2.fastq.gz --outfq D4_Meso_iCre_Dox_1.subset.R1.fastq.gz --outfq2 D4_Meso_iCre_Dox_1.subset.R2.fastq.gz --readids D4_Meso_iCre_Dox_1.chr19.readids
 % python /data/kopardevn/GitRepos/Tools/scripts/filter_fastq_by_readids_highmem_pe.py --infq D4_Meso_iCre_Dox_2.R1.fastq.gz --infq2 D4_Meso_iCre_Dox_2.R2.fastq.gz --outfq D4_Meso_iCre_Dox_2.subset.R1.fastq.gz --outfq2 D4_Meso_iCre_Dox_2.subset.R2.fastq.gz --readids D4_Meso_iCre_Dox_2.chr19.readids
- ```
+```
 
 Now, the **samples.tsv** will look something like this:
+
 ```
 sampleName	path_to_R1_fastq	path_to_R2_fastq
 D4_Meso_iCre_Dox_1	/data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq/D4_Meso_iCre_Dox_1.subset.R1.fastq.paired.fq.gz	/data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq/D4_Meso_iCre_Dox_1.subset.R2.fastq.pai
@@ -27,4 +32,4 @@ D4_Meso_iCre_Dox_2	/data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq/D4_Meso_iCre
 red.fq.gz
 iCre_D0_1	/data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq/iCre_D0_1.subset.R1.fastq.paired.fq.gz	/data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq/iCre_D0_1.subset.R2.fastq.paired.fq.gz
 iCre_D0_2	/data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq/iCre_D0_2.subset.R1.fastq.paired.fq.gz	/data/CCBR/rawdata/ccbr872/ccbr872_atacseq/fastq/iCre_D0_2.subset.R2.fastq.paired.fq.gz
-```
\ No newline at end of file
+```
diff --git a/docs/outputs.md b/docs/outputs.md
index c1db587..cbbecf5 100644
--- a/docs/outputs.md
+++ b/docs/outputs.md
@@ -39,14 +39,14 @@ Here are more details about these files:
 | `dryrun_git_commit.txt`          | TXT           | dryrun                                                | The git commit hash of the version of ASPEN used at dryrun                                                                                                            |
 | `dryrun.log`                     | TXT           | dryrun                                                | Log from `-m=dryrun`                                                                                                                                                  |
 | `fastqs`                         | FOLDER        | dryrun                                                | Folder containing symlinks to raw data                                                                                                                                |
-| `logs`                           | FOLDER        | dryrun                                                | Folder containing all logs including Slurm `.out` and `.err` files. Also contains older timestamped `runinfo.yaml` and `snakemake.stats` files.                                                                                                    |
+| `logs`                           | FOLDER        | dryrun                                                | Folder containing all logs including Slurm `.out` and `.err` files. Also contains older timestamped `runinfo.yaml` and `snakemake.stats` files.                       |
 | `results`                        | FOLDER        | Created at dryrun but populated during run            | Main outputs folder                                                                                                                                                   |
 | `runinfo.yaml`                   | YAML          | After completion of run                               | Metadata about the run executor, etc.                                                                                                                                 |
 | `runslurm_snakemake_report.html` | HTML          | After completion of run                               | HTML report including DAG and resource utilization                                                                                                                    |
 | `sampleinfo.txt`                 | TXT           | dryrun, run                                           | Tab-delimited mappings between `replicateNames` and `sampleNames`                                                                                                     |
 | `samples.tsv`                    | TSV           | init; can be edited later                             | Tab-delimited manifest with `replicateName`, `sampleName`, `path_to_R1_fastq`, `path_to_R2_fastq`. This file has a header.                                            |
 | `scripts`                        | FOLDER        | init                                                  | Folder keeps local copy of scripts called by various rules                                                                                                            |
-| `run_git_commit.txt`          | TXT           | run                                                | The git commit hash of the version of ASPEN used at run                                                                                                            |
+| `run_git_commit.txt`             | TXT           | run                                                   | The git commit hash of the version of ASPEN used at run                                                                                                               |
 | `slurm-XXXXXXX.out`              | TXT           | run                                                   | Slurm `.out` file for the master job                                                                                                                                  |
 | `snakemake.log`                  | TXT           | run                                                   | Snakemake `.log` file for the master job; older copies timestamped and moved into `logs` folder                                                                       |
 | `snakemake.stats`                | JSON          | run                                                   | per rule runtime stats                                                                                                                                                |
@@ -122,16 +122,14 @@ Content details:
 | visualization | tn5sites_bam        | - Tn5 nicking sites in BAM format. 
- Derived from `filteredBam`. | | visualization | tn5sites_bigwig | - Tn5 nicking sites in BIGWIG format.
- Scaled using spike-in scaling factors if present.
- Derived from `tn5sites_bam`. | - !!! note - BAM files from `dedupBam` can be used for downstream footprinting analysis using [CCBR_TOBIAS](https://github.com/CCBR/CCBR_Tobias) pipeline +BAM files from `dedupBam` can be used for downstream footprinting analysis using [CCBR_TOBIAS](https://github.com/CCBR/CCBR_Tobias) pipeline !!! note - [bamCompare](https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html) from deeptools can be run to compare BAMs from `dedupBam` for comprehensive BAM comparisons. +[bamCompare](https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html) from deeptools can be run to compare BAMs from `dedupBam` for comprehensive BAM comparisons. !!! note - BAM files from `dedupBam` can also be converted to BED format and processed with [chromVAR](https://github.com/GreenleafLab/chromVAR) to identify variability in motif accessibility across samples and assess differentially active transcription factors from the JASPAR database. - +BAM files from `dedupBam` can also be converted to BED format and processed with [chromVAR](https://github.com/GreenleafLab/chromVAR) to identify variability in motif accessibility across samples and assess differentially active transcription factors from the JASPAR database. #### Peak Annotation folder @@ -306,4 +304,3 @@ This directory contains all .err and .out log files generated by SLURM for jobs This structure is particularly useful for troubleshooting and debugging, especially when the SLURM job IDs of failed jobs are known. By examining the corresponding .err or .out files, users can efficiently identify the source of errors within specific Snakemake rules and wildcards. > DISCLAIMER: This folder hierarchy is significantly different than v1.0.6 and is subject to change with subsequent versions. - diff --git a/docs/overview.md b/docs/overview.md index 54fcc22..31d6449 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -4,14 +4,13 @@ To address [these](introduction.md#challenges-in-atac-seq-data-analysis) challen ## **Flowchart** -The flowchart below provides a visual summary of the ASPEN pipeline, illustrating the key steps involved in preprocessing, alignment, peak calling, and downstream analyses. +The flowchart below provides a visual summary of the ASPEN pipeline, illustrating the key steps involved in preprocessing, alignment, peak calling, and downstream analyses. ![alt text](assets/images/flowchart.png) - ## ๐Ÿ› ๏ธ **Data Preprocessing** -### โœ‚๏ธ **Adapter Trimming** +### โœ‚๏ธ **Adapter Trimming** Utilizes CutAdapt to remove adapter sequences from raw reads, ensuring that subsequent analyses are not confounded by extraneous sequences. @@ -40,7 +39,7 @@ This alignment process ensures that only high-confidence reads are retained, pro ### ๐Ÿ” **Peak Detection** -ASPEN employs both MACS2 and Genrich for the identification of regions of open chromatin, ensuring comprehensive detection of regulatory elements. +ASPEN employs both MACS2 and Genrich for the identification of regions of open chromatin, ensuring comprehensive detection of regulatory elements. #### **MACS2 Peak Calling** @@ -52,7 +51,7 @@ Genrich is integrated into the pipeline to complement MACS2. This tool is partic By combining the strengths of MACS2 and Genrich, ASPEN delivers a comprehensive and reliable peak detection framework, facilitating downstream analyses and enabling researchers to uncover critical insights into chromatin accessibility and gene regulation. -### ๐Ÿค **Consensus Peaks** +### ๐Ÿค **Consensus Peaks** For datasets with multiple replicates, ASPEN employs several strategies to derive consensus peaks across replicates: @@ -60,11 +59,10 @@ For datasets with multiple replicates, ASPEN employs several strategies to deriv - **`pooled.narrowPeak`**: In this approach, reads from all replicates are pooled together, and peak calling is performed on the combined dataset to generate a unified set of "pooled" peaks. -- **`fixed_width.consensus.narrowPeak`**: This method calculates consensus peaks by renormalizing p-value scores across replicates, following the approach outlined by [_Corces et al._](https://doi.org/10.1038/nmeth.4396). +- **`fixed_width.consensus.narrowPeak`**: This method calculates consensus peaks by renormalizing p-value scores across replicates, following the approach outlined by [_Corces et al._](https://doi.org/10.1038/nmeth.4396). Among these methods, the fixed-width consensus peak strategy is recommended for its robustness and reproducibility. The other outputs are provided primarily for exploratory purposes. - ### ๐Ÿท๏ธ **Peak Annotation** ASPEN integrates ChIPseeker to perform comprehensive annotation of identified peaks, offering valuable insights into their genomic context and potential regulatory functions. This process involves associating peaks with specific genomic features, such as promoters, enhancers, gene bodies, or intergenic regions. By providing detailed annotations, ASPEN facilitates the interpretation of chromatin accessibility data, enabling researchers to uncover the functional significance of these regions in gene regulation and other biological processes. @@ -105,11 +103,11 @@ The Fraction of Reads in Peaks (FRiP) score quantifies the proportion of sequenc ### ๐Ÿ”‘ **HOMER and AME** -ASPEN integrates motif enrichment analysis tools, including HOMER and AME, to identify transcription factor binding motifs within accessible chromatin regions. This analysis provides valuable insights into the regulatory mechanisms governing gene expression. By uncovering enriched motifs, researchers can infer the potential transcription factors driving chromatin accessibility changes and identify key regulators of cellular processes. The combination of HOMER and AME ensures a comprehensive and robust approach to motif discovery, facilitating the interpretation of ATAC-seq data in the context of gene regulation and epigenetic control. HOCOMOCO v11 motifs, which are bundled as resources with ASPEN, are used for both motif enrichment tools. +ASPEN integrates motif enrichment analysis tools, including HOMER and AME, to identify transcription factor binding motifs within accessible chromatin regions. This analysis provides valuable insights into the regulatory mechanisms governing gene expression. By uncovering enriched motifs, researchers can infer the potential transcription factors driving chromatin accessibility changes and identify key regulators of cellular processes. The combination of HOMER and AME ensures a comprehensive and robust approach to motif discovery, facilitating the interpretation of ATAC-seq data in the context of gene regulation and epigenetic control. HOCOMOCO v11 motifs, which are bundled as resources with ASPEN, are used for both motif enrichment tools. ## ๐Ÿ“œ **Comprehensive Reporting** -### ๐Ÿ“Š **MultiQC Integration** +### ๐Ÿ“Š **MultiQC Integration** Generates a comprehensive and interactive HTML report that consolidates all quality control metrics, analysis results, and visualizations into a single, user-friendly document. This report is designed to facilitate the interpretation of results, provide clear and actionable insights, and enable seamless sharing of findings with collaborators. By integrating diverse outputs into an organized and visually appealing format, the HTML report ensures that researchers can efficiently explore their data and communicate their discoveries effectively. @@ -147,7 +145,7 @@ To enhance the biological interpretation of differential accessibility results, To account for potential global shifts in chromatin accessibilityโ€”particularly in perturbation studies where widespread chromatin compaction or relaxation may occurโ€”ASPEN optionally supports **spike-in normalization**. -Spike-in normalization involves the use of exogenous DNA or cells from a different species (e.g., *Drosophila melanogaster* or *E. coli*) that are added in equal proportions across all experimental samples prior to lysis and tagmentation. These spike-in reads serve as an internal control to correct for technical variation and global accessibility shifts that may not be captured by traditional normalization strategies. +Spike-in normalization involves the use of exogenous DNA or cells from a different species (e.g., _Drosophila melanogaster_ or _E. coli_) that are added in equal proportions across all experimental samples prior to lysis and tagmentation. These spike-in reads serve as an internal control to correct for technical variation and global accessibility shifts that may not be captured by traditional normalization strategies. In ASPEN, if spike-in data is present: @@ -156,6 +154,7 @@ In ASPEN, if spike-in data is present: - A normalization factor is calculated based on spike-in counts and applied to the accessibility read counts from the host genome. This spike-in-derived scaling factor allows the comparison of chromatin accessibility across conditions even when global chromatin accessibility levels differ (e.g., treatment-induced repression or global decondensation). This method is particularly valuable in experiments involving: + - Transcription factor knockdowns/knockouts - Chromatin remodeler inhibition - Drug-induced chromatin modulation diff --git a/flowcharts/README.md b/flowcharts/README.md index a943cf0..d63c329 100644 --- a/flowcharts/README.md +++ b/flowcharts/README.md @@ -4,9 +4,7 @@ Tools from numerous open source ATACseq data analysis are composed together to c #### Overview - ![img](https://i.loli.net/2021/11/16/a9yOfZdrMhLEGmD.png) - - +![img](https://i.loli.net/2021/11/16/a9yOfZdrMhLEGmD.png) #### TAD or Trimming-Alignment-Deduplication @@ -18,16 +16,16 @@ Tools from numerous open source ATACseq data analysis are composed together to c #### TSS (transcript-start-site) enrichment - ![img](https://i.loli.net/2021/11/16/wZvSNEdBieCbncR.png) +![img](https://i.loli.net/2021/11/16/wZvSNEdBieCbncR.png) #### FRiP (fraction of reads in peaks) -#### ![img](https://i.loli.net/2021/11/16/YjeQlXFD5tGgL4k.png) +#### ![img](https://i.loli.net/2021/11/16/YjeQlXFD5tGgL4k.png) #### Motif enrichment - ![img](https://i.loli.net/2021/11/16/ikDREMzPtf9WNLH.png) +![img](https://i.loli.net/2021/11/16/ikDREMzPtf9WNLH.png) #### Jaccard and ChIPSeeker - ![img](https://i.loli.net/2021/11/16/o7zlj3nbmR6vqH2.png) \ No newline at end of file +![img](https://i.loli.net/2021/11/16/o7zlj3nbmR6vqH2.png) diff --git a/resources/README.md b/resources/README.md index 8fe23b9..7b2d4dc 100644 --- a/resources/README.md +++ b/resources/README.md @@ -1,12 +1,12 @@ ## Resources This folder, `resources/`, is meant to contain all resources (except larger genome indexes) necessary for running the workflow. The subfolders are: -* **blacklistFa**: Blacklist BED files are downloaded for human (v3) and mouse (v1). Using the coordinates from these BED files, fasta sequences are extracted for hg19/39 and mm10. -* **frip**: BED files for DHS/enhancer/promoter locations for hg19/39 and mm10 genomes. -* **motif**: HOCOMOCO v11 motifs in HOMER and MEME formats for human and mouse. -* **tssBed**: BED files pin-pointing the transcription start sites of annotated genes for hg19/38 and mm10. +- **blacklistFa**: Blacklist BED files are downloaded for human (v3) and mouse (v1). Using the coordinates from these BED files, fasta sequences are extracted for hg19/39 and mm10. +- **frip**: BED files for DHS/enhancer/promoter locations for hg19/39 and mm10 genomes. +- **motif**: HOCOMOCO v11 motifs in HOMER and MEME formats for human and mouse. +- **tssBed**: BED files pin-pointing the transcription start sites of annotated genes for hg19/38 and mm10. `cluster.json` file can make specific resource requests to biowulf via slurm. Each Snakemake rule can make a unique hardware request for slurm execution. -`tools.yaml` file would typically contain the modules required to be loaded for rule execution on the Biowulf cluster. Since we are using dockers for all rules in this pipeline, `tools.yaml` will be empty. \ No newline at end of file +`tools.yaml` file would typically contain the modules required to be loaded for rule execution on the Biowulf cluster. Since we are using dockers for all rules in this pipeline, `tools.yaml` will be empty. diff --git a/resources/blacklistFa/create_blacklist_indexes.sh b/resources/blacklistFa/create_blacklist_indexes.sh index c2e4744..158bfac 100755 --- a/resources/blacklistFa/create_blacklist_indexes.sh +++ b/resources/blacklistFa/create_blacklist_indexes.sh @@ -9,8 +9,8 @@ # fasta files are created using bedtools getfasta on biowulf and then transferred here # chrM and chr_rDNA are added to blacklist fasta to create # 1.mm10.blacklist.chrM.chr_rDNA.fa -# 2.hg19.blacklist_v3.chrM.chr_rDNA.fa -# 3.hg38.blacklist_v3.chrM.chr_rDNA.fa +# 2.hg19.blacklist_v3.chrM.chr_rDNA.fa +# 3.hg38.blacklist_v3.chrM.chr_rDNA.fa # create index bwa index -p mm10_blacklist mm10.blacklist.chrM.chr_rDNA.fa diff --git a/resources/chroms/bosTau9.chroms b/resources/chroms/bosTau9.chroms index e43f4c1..15e83f2 100644 --- a/resources/chroms/bosTau9.chroms +++ b/resources/chroms/bosTau9.chroms @@ -1 +1 @@ -chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chr23 chr24 chr25 chr26 chr27 chr28 chr29 chrX \ No newline at end of file +chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chr23 chr24 chr25 chr26 chr27 chr28 chr29 chrX diff --git a/resources/chroms/hg19.chroms b/resources/chroms/hg19.chroms index 0447c90..1f297d4 100644 --- a/resources/chroms/hg19.chroms +++ b/resources/chroms/hg19.chroms @@ -1 +1 @@ -chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY \ No newline at end of file +chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY diff --git a/resources/chroms/hg38.chroms b/resources/chroms/hg38.chroms index 0447c90..1f297d4 100644 --- a/resources/chroms/hg38.chroms +++ b/resources/chroms/hg38.chroms @@ -1 +1 @@ -chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY \ No newline at end of file +chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY diff --git a/resources/chroms/hs1.chroms b/resources/chroms/hs1.chroms index 0447c90..1f297d4 100644 --- a/resources/chroms/hs1.chroms +++ b/resources/chroms/hs1.chroms @@ -1 +1 @@ -chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY \ No newline at end of file +chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY diff --git a/resources/chroms/mm10.chroms b/resources/chroms/mm10.chroms index 5205b92..9942210 100644 --- a/resources/chroms/mm10.chroms +++ b/resources/chroms/mm10.chroms @@ -1 +1 @@ -chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY \ No newline at end of file +chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX chrY diff --git a/resources/chroms/mmul10.chroms b/resources/chroms/mmul10.chroms index fed8303..656ffcb 100644 --- a/resources/chroms/mmul10.chroms +++ b/resources/chroms/mmul10.chroms @@ -1 +1 @@ -chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chrX chrY \ No newline at end of file +chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chrX chrY diff --git a/resources/cluster.json b/resources/cluster.json index 182c154..7f18f4d 100755 --- a/resources/cluster.json +++ b/resources/cluster.json @@ -1,111 +1,112 @@ { - "__default__": { - "gres": "lscratch:256", - "mem": "48g", - "partition": "norm,ccr", "qos": "global", - "threads": "2", - "time": "2-00:00:00", - "name" : "{rule}.{wildcards}", - "output" : "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out", - "error" : "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err" - }, - "trim_align_dedup": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "trim": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "remove_BL": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "create_bigwigs": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "align": { - "mem": "240g", - "threads": "16", - "time": "12:00:00" - }, - "align2spikein": { - "mem": "240g", - "threads": "16", - "time": "12:00:00" - }, - "fastqc": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "atac_tss": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "motif_enrichment": { - "mem": "120g", - "threads": "16", - "time": "24:00:00" - }, - "create_tn5bams": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "jaccard": { - "mem": "40g", - "threads": "16", - "time": "02-00:00:00" - }, - "create_BL_index": { - "time": "1:00:00" - }, - "fastq_screen": { - "mem": "120g", - "threads": "16", - "time": "12:00:00" - }, - "atac_macs_peakcalling": { - "mem": "120g", - "threads": "4", - "time": "12:00:00" - }, - "atac_genrich_peakcalling": { - "mem": "120g", - "threads": "4", - "time": "12:00:00" - }, - "atac_macs_peakcalling_fixed_width": { - "mem": "120g", - "threads": "4", - "time": "12:00:00" - }, - "atac_genrich_peakcalling_fixed_width": { - "mem": "120g", - "threads": "4", - "time": "12:00:00" - }, - "atac_genrich_calculate_regions_of_interest_for_diffatac": { - "mem": "120g", - "threads": "4", - "time": "12:00:00" - }, - "get_counts_table": { - "threads": "16", - "mem": "120g", - "time": "12:00:00" - }, - "frip": { - "time": "12:00:00" - }, - "multiqc": { - "time": "4:00:00" - } + "__default__": { + "gres": "lscratch:256", + "mem": "48g", + "partition": "norm,ccr", + "qos": "global", + "threads": "2", + "time": "2-00:00:00", + "name": "{rule}.{wildcards}", + "output": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.out", + "error": "logs/${{SLURM_JOBID}}.%j.{rule}.{wildcards}.err" + }, + "trim_align_dedup": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "trim": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "remove_BL": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "create_bigwigs": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "align": { + "mem": "240g", + "threads": "16", + "time": "12:00:00" + }, + "align2spikein": { + "mem": "240g", + "threads": "16", + "time": "12:00:00" + }, + "fastqc": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "atac_tss": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "motif_enrichment": { + "mem": "120g", + "threads": "16", + "time": "24:00:00" + }, + "create_tn5bams": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "jaccard": { + "mem": "40g", + "threads": "16", + "time": "02-00:00:00" + }, + "create_BL_index": { + "time": "1:00:00" + }, + "fastq_screen": { + "mem": "120g", + "threads": "16", + "time": "12:00:00" + }, + "atac_macs_peakcalling": { + "mem": "120g", + "threads": "4", + "time": "12:00:00" + }, + "atac_genrich_peakcalling": { + "mem": "120g", + "threads": "4", + "time": "12:00:00" + }, + "atac_macs_peakcalling_fixed_width": { + "mem": "120g", + "threads": "4", + "time": "12:00:00" + }, + "atac_genrich_peakcalling_fixed_width": { + "mem": "120g", + "threads": "4", + "time": "12:00:00" + }, + "atac_genrich_calculate_regions_of_interest_for_diffatac": { + "mem": "120g", + "threads": "4", + "time": "12:00:00" + }, + "get_counts_table": { + "threads": "16", + "mem": "120g", + "time": "12:00:00" + }, + "frip": { + "time": "12:00:00" + }, + "multiqc": { + "time": "4:00:00" + } } diff --git a/resources/frip/README.md b/resources/frip/README.md index a339fe8..e716d39 100644 --- a/resources/frip/README.md +++ b/resources/frip/README.md @@ -1,15 +1,18 @@ # Promoter regions + ### biowulf location: /data/CCBR_Pipeliner/db/PipeDB/db/Promoters Definition: -* GTF file with "gene" in column 3 and "protein_coding" in description -* 2kb upstream from TSS and 200 bp downstream + +- GTF file with "gene" in column 3 and "protein_coding" in description +- 2kb upstream from TSS and 200 bp downstream GTF versions: -* Gencode version 42 for hg38 -* Gencode version 42 for hg19 -* Gencode version M1 for mm9 -* Gencode version M25 for mm10 + +- Gencode version 42 for hg38 +- Gencode version 42 for hg19 +- Gencode version M1 for mm9 +- Gencode version M25 for mm10 Use "create_promoters_bed.py" to generate BEDs @@ -26,30 +29,38 @@ optional arguments: ``` # DNAase Hypersensitivity Regions (DHS) + ### biowulf location: /data/CCBR_Pipeliner/db/PipeDB/db/DHS/ENCODE -* mm9 and hg19 versions are downloaded from ENCODE (source: https://www.encodeproject.org/search/?type=Annotation&encyclopedia_version=4&annotation_type=representative+DNase+hypersensitivity+sites) -* overlapping regions are collaped using "bedtools merge" -* mm10 and hg38 versions are then crossmapped from these collapsed versions from the mm10 and hg19 versions, respectively. -# Enhancers +- mm9 and hg19 versions are downloaded from ENCODE (source: https://www.encodeproject.org/search/?type=Annotation&encyclopedia_version=4&annotation_type=representative+DNase+hypersensitivity+sites) +- overlapping regions are collaped using "bedtools merge" +- mm10 and hg38 versions are then crossmapped from these collapsed versions from the mm10 and hg19 versions, respectively. + +# Enhancers + ### biowulf location: /data/CCBR_Pipeliner/db/PipeDB/db/Enhancers/merged + Sources: -* HACER - * hg19 : http://bioinfo.vanderbilt.edu/AE/HACER/download/T1.txt +- HACER + + - hg19 : http://bioinfo.vanderbilt.edu/AE/HACER/download/T1.txt + +- FANTOM + + - hg38 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/extra/enhancer/F5.hg38.enhancers.bed.gz + - hg19 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/human_permissive_enhancers_phase_1_and_2.bed.gz + - mm9 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/mouse_permissive_enhancers_phase_1_and_2.bed.gz + - mm10 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/mm10_latest/extra/enhancer/F5.mm10.enhancers.bed.gz + +- DBSuper -* FANTOM - * hg38 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/hg38_latest/extra/enhancer/F5.hg38.enhancers.bed.gz - * hg19 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/human_permissive_enhancers_phase_1_and_2.bed.gz - * mm9 : http://fantom.gsc.riken.jp/5/datafiles/latest/extra/Enhancers/mouse_permissive_enhancers_phase_1_and_2.bed.gz - * mm10 : http://fantom.gsc.riken.jp/5/datafiles/reprocessed/mm10_latest/extra/enhancer/F5.mm10.enhancers.bed.gz + - mm9 : http://asntech.org/dbsuper/data/bed/mm9/all_mm9_bed.bed + - hg19 : http://asntech.org/dbsuper/data/bed/hg19/all_hg19_bed.bed -* DBSuper - * mm9 : http://asntech.org/dbsuper/data/bed/mm9/all_mm9_bed.bed - * hg19 : http://asntech.org/dbsuper/data/bed/hg19/all_hg19_bed.bed +- EnhancerAtlas: + - fasta files were downloaded using the following commands for hg19 and mm9 data: -* EnhancerAtlas: - * fasta files were downloaded using the following commands for hg19 and mm9 data: ``` % for i in 3T3-L1 416B AtT-20 BAT Bone_marrow Brain_E14.5 Brown_preadipocyte_E18.5 C3H10Thalf CD19+ CD4+CD8+ CD4+Treg CD4+ CD8+ Cerebellum Cerebellum_neonate CH12 CMP Cortex Dendritic_cell EpiLC EpiSC Erythroid_fetal_liver Erythroid_spleen ESC_Bruce4 ESC_J1 ESC_KH2 ESC_NPC Forebrain_E11.5 Forebrain_E12.5 Forelimb_bud_embryo Forelimb_E11 Forelimb_E13 G1E-ER4 G1E GMP Heart_E11.5 Heart_E12.5 Heart_E14.5 Heart Hepatocyte HFSC Hindbrain_E11.5 Intestine IPSC Kidney Large_intestine_epithelial Lens_P1 Limb_E11.5 Limb_E14.5 Liver_E14.5 Liver Lung_E14.5 Lung Lung_neonate MC3T3-E1 Megakaryocyte MEL Microglia Midbrain_E11 Neuron_cortical NIH-3T3 NKC_spleen NKT NPC Olfactory_bulb Pancreas Pancreatic_islet PDC_BM PDC Peritoneal_macrophage Placenta Pre-B Pre-pro-B Pro-B_BM Prostate Rib_chondrocyte_P1 Spermatid Spleen Stomach_neonate Striatum Testis Th1 Th2 Thymus Treg_cell Uterus V6.5 WAT ZHBTc4;do wget http://www.enhanceratlas.org/data/AllEPs/mm/${i}_EP.txt @@ -57,11 +68,11 @@ done % for f in A549 BJ Caco-2 CD4+ CD8+ CD14+ CD19+ CD20+ CD34+ CD36+ CD133+ CMK CUTLL1 ECC-1 GM10847 GM12878 GM12891 GM12892 GM18486 GM18505 GM18507 GM18508 GM18516 GM18522 GM18526 GM18951 GM19099 GM19141 GM19193 GM19238 GM19239 GM19240 H1 H9 H54 H128 H2171 HCT116 HEK293 HEK293T Hela Hela-S3 HepG2 HL-60 HMEC HSMM HUES64 HUVEC IMR90 Jurkat K562 Kasumi-1 LNCaP LoVo LS174T MCF-7 MCF10A ME-1 MM1S NB4 NH-A NHDF NHEK NHLF NT2-D1 OCI-Ly1 P493-6 PANC-1 Raji SK-N-MC SK-N-SH T47D th1 U2OS U87 VCaP;do echo $f;wget http://www.enhanceratlas.org/data/enhseq/${f}.fasta;done ``` - * chromsome, start, end info is extracted from the fasta file sequence ids and saved as bed files for hg19 and mm9 +- chromsome, start, end info is extracted from the fasta file sequence ids and saved as bed files for hg19 and mm9 Processing: - * overlapping regions are collaped using "bedtools merge" for each source - * if a version is not available from a particular source for a particular genome version then it is created using the collapsed regions file from previous step - * collapsed regions files from all four sources, HACER/FANTOM/DBSuper/EnhancedAtlas are concatenated using "cat" and collaped again using "bedtools merge" +- overlapping regions are collaped using "bedtools merge" for each source +- if a version is not available from a particular source for a particular genome version then it is created using the collapsed regions file from previous step +- collapsed regions files from all four sources, HACER/FANTOM/DBSuper/EnhancedAtlas are concatenated using "cat" and collaped again using "bedtools merge" diff --git a/resources/frip/create_promoters_bed.py b/resources/frip/create_promoters_bed.py index f5292ab..1b31f18 100755 --- a/resources/frip/create_promoters_bed.py +++ b/resources/frip/create_promoters_bed.py @@ -1,78 +1,100 @@ #!/usr/bin/env python3 -import pysam,sys,os,numpy,subprocess +import pysam, sys, os, numpy, subprocess import argparse -import tempfile,shutil +import tempfile, shutil + + +def _get_gene_metadata(l, what): + for i, j in enumerate(l): + if j == what: + return l[i + 1].replace(";", "").replace('"', "") + else: + return "" -def _get_gene_metadata(l,what): - for i,j in enumerate(l): - if j == what: - return l[i+1].replace(';','').replace('"','') - else: - return '' def get_gene_metadata(l): - metadata_str_list = l[8].strip().split() - geneID = _get_gene_metadata(metadata_str_list,'gene_id') - geneName = _get_gene_metadata(metadata_str_list,'gene_name') - geneType = '' - geneType = _get_gene_metadata(metadata_str_list,'gene_type') - if geneType == '': - geneType = _get_gene_metadata(metadata_str_list,'gene_biotype') - chrom = l[0] - strand = l[6] - start = int(l[3])-1 - end = int(l[4])-1 - if strand == '-': - tmp = start - start = end - end = tmp - score = '.' - return geneID, geneName, geneType, chrom, start, strand + metadata_str_list = l[8].strip().split() + geneID = _get_gene_metadata(metadata_str_list, "gene_id") + geneName = _get_gene_metadata(metadata_str_list, "gene_name") + geneType = "" + geneType = _get_gene_metadata(metadata_str_list, "gene_type") + if geneType == "": + geneType = _get_gene_metadata(metadata_str_list, "gene_biotype") + chrom = l[0] + strand = l[6] + start = int(l[3]) - 1 + end = int(l[4]) - 1 + if strand == "-": + tmp = start + start = end + end = tmp + score = "." + return geneID, geneName, geneType, chrom, start, strand -parser = argparse.ArgumentParser(description='Create windows around TSS for each gene in GTF file') -#parser.add_argument('--gtf',required=True,help='GTF',type=argparse.FileType('r')) -parser.add_argument('--gtf',required=True,help='GTF file path') -parser.add_argument('--out',required=True,help='Output BED file') +parser = argparse.ArgumentParser( + description="Create windows around TSS for each gene in GTF file" +) +# parser.add_argument('--gtf',required=True,help='GTF',type=argparse.FileType('r')) +parser.add_argument("--gtf", required=True, help="GTF file path") +parser.add_argument("--out", required=True, help="Output BED file") args = parser.parse_args() -gtfFile = open(args.gtf,'r') +gtfFile = open(args.gtf, "r") genes_dict = dict() gtfLines = gtfFile.readlines() gtfFile.close() -#chroms=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', -# 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', -# 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'chrMT'] +# chroms=['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', +# 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', +# 'chr21', 'chr22', 'chrX', 'chrY', 'chrM', 'chrMT'] for l in gtfLines: - l = l.strip() - if not l.startswith('#'): - l = l.split("\t") - if l[2] == "gene": - geneID, geneName, geneType, chrom, start, strand = get_gene_metadata(l) - if geneName == '': - geneName = geneID - if geneType == "protein_coding": - genes_dict[geneID] = dict() - genes_dict[geneID]['geneName'] = geneName - genes_dict[geneID]['start'] = start - genes_dict[geneID]['strand'] = strand - genes_dict[geneID]['chrom'] = chrom + l = l.strip() + if not l.startswith("#"): + l = l.split("\t") + if l[2] == "gene": + geneID, geneName, geneType, chrom, start, strand = get_gene_metadata(l) + if geneName == "": + geneName = geneID + if geneType == "protein_coding": + genes_dict[geneID] = dict() + genes_dict[geneID]["geneName"] = geneName + genes_dict[geneID]["start"] = start + genes_dict[geneID]["strand"] = strand + genes_dict[geneID]["chrom"] = chrom -#chr1 67091 69291 chr1-67091:69291|ENSG00000186092.4|OR4F5 0 + -outBED = open(args.out,'w') -for geneID,metaData in genes_dict.items(): - if metaData['strand']=='-': - s = metaData['start'] - 200 - e = metaData['start'] + 2000 - else: - s = metaData['start'] - 2000 - e = metaData['start'] + 200 - if s <= 0: continue - regionName = metaData['chrom'] + ":" + str(s) + "-" + str(e) + "|" + geneID + "|" + metaData['geneName'] - outList = [metaData['chrom'],str(s),str(e),regionName,str(0),metaData['strand']] - outStr = "\t".join(outList) - outBED.write(outStr+'\n') +# chr1 67091 69291 chr1-67091:69291|ENSG00000186092.4|OR4F5 0 + +outBED = open(args.out, "w") +for geneID, metaData in genes_dict.items(): + if metaData["strand"] == "-": + s = metaData["start"] - 200 + e = metaData["start"] + 2000 + else: + s = metaData["start"] - 2000 + e = metaData["start"] + 200 + if s <= 0: + continue + regionName = ( + metaData["chrom"] + + ":" + + str(s) + + "-" + + str(e) + + "|" + + geneID + + "|" + + metaData["geneName"] + ) + outList = [ + metaData["chrom"], + str(s), + str(e), + regionName, + str(0), + metaData["strand"], + ] + outStr = "\t".join(outList) + outBED.write(outStr + "\n") outBED.close() diff --git a/resources/motif/HOCOMOCOv11_full_HUMAN_MOUSE_mono_homer_format_0.001.motif b/resources/motif/HOCOMOCOv11_full_HUMAN_MOUSE_mono_homer_format_0.001.motif index c4b73eb..d35d122 100644 --- a/resources/motif/HOCOMOCOv11_full_HUMAN_MOUSE_mono_homer_format_0.001.motif +++ b/resources/motif/HOCOMOCOv11_full_HUMAN_MOUSE_mono_homer_format_0.001.motif @@ -18753,4 +18753,4 @@ 0.08924163580302721 0.0061118353057332785 0.896089699453233 0.00855682943800663 0.24083127200397494 0.4877756793635834 0.199266371755328 0.07212667687711374 0.5831304505222441 0.04767673555438024 0.16992644216804778 0.199266371755328 -0.06234670034802033 0.4070908729985628 0.17481643043259446 0.3557459962208224 \ No newline at end of file +0.06234670034802033 0.4070908729985628 0.17481643043259446 0.3557459962208224 diff --git a/resources/motif/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.001.motif b/resources/motif/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.001.motif index abd5da7..c7a46f2 100644 --- a/resources/motif/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.001.motif +++ b/resources/motif/HOCOMOCOv11_full_HUMAN_mono_homer_format_0.001.motif @@ -11302,4 +11302,4 @@ 0.0015159934114744613 0.9885735697562056 0.0015159934114744613 0.00839444342084541 0.9857791994398987 0.002160848099852988 0.009039298109223937 0.0030206543510243565 0.17734637177601936 0.46452165966725645 0.16874830926430567 0.18938365929241852 -0.23839261560918654 0.18508462803656167 0.09222555291005385 0.48429720344419797 \ No newline at end of file +0.23839261560918654 0.18508462803656167 0.09222555291005385 0.48429720344419797 diff --git a/resources/motif/HOCOMOCOv11_full_MOUSE_mono_homer_format_0.001.motif b/resources/motif/HOCOMOCOv11_full_MOUSE_mono_homer_format_0.001.motif index 4702276..bb89a27 100644 --- a/resources/motif/HOCOMOCOv11_full_MOUSE_mono_homer_format_0.001.motif +++ b/resources/motif/HOCOMOCOv11_full_MOUSE_mono_homer_format_0.001.motif @@ -7449,4 +7449,4 @@ 0.08924163580302721 0.0061118353057332785 0.896089699453233 0.00855682943800663 0.24083127200397494 0.4877756793635834 0.199266371755328 0.07212667687711374 0.5831304505222441 0.04767673555438024 0.16992644216804778 0.199266371755328 -0.06234670034802033 0.4070908729985628 0.17481643043259446 0.3557459962208224 \ No newline at end of file +0.06234670034802033 0.4070908729985628 0.17481643043259446 0.3557459962208224 diff --git a/resources/motif/README.md b/resources/motif/README.md index e790617..bc66706 100644 --- a/resources/motif/README.md +++ b/resources/motif/README.md @@ -1,7 +1,7 @@ ## MOTIFS + ### HOCOMOCO: Motifs were downloaded from the HOCOMOCO database (version 11) from https://hocomoco11.autosome.org/downloads_v11 in meme and homer (P-value 0.0001) formats. Also created combined HUMAN + MOUSE motif files. These can be used for other species like Macaca. - diff --git a/resources/tools.yaml b/resources/tools.yaml index e429bd4..95f3955 100644 --- a/resources/tools.yaml +++ b/resources/tools.yaml @@ -1,2 +1,2 @@ fastqscreen: - version: "fastq_screen/0.14.1" \ No newline at end of file + version: "fastq_screen/0.14.1" diff --git a/resources/tools.yaml.old b/resources/tools.yaml.old index 25fb0f2..7877c71 100644 --- a/resources/tools.yaml.old +++ b/resources/tools.yaml.old @@ -26,4 +26,4 @@ star: stringtie: version: "stringtie/2.1.4" ucsc: - version: "ucsc/407" \ No newline at end of file + version: "ucsc/407" diff --git a/resources/tssBed/README.md b/resources/tssBed/README.md index 21337c9..be1c796 100644 --- a/resources/tssBed/README.md +++ b/resources/tssBed/README.md @@ -1,10 +1,13 @@ # Promoter regions + ### BIOWULF location for GTFs: /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/ + ### script: ./create_per_gene_TSS_bins.py creates the required tssbed tarball. It: + - reads "gene" lines from the GTF - is strand aware - extracts chrom, start, geneid, geneName from the "gene" line for gene_type == "protein_coding" only -- creates 400 bins around the TSS +- creates 400 bins around the TSS - start if on + strand - end if on - strand - each bin is 10 bp width so total -2k through +2k rgeion is covered in the 400 bins diff --git a/resources/tssBed/create_per_gene_TSS_bins.py b/resources/tssBed/create_per_gene_TSS_bins.py index 661f7ff..87aa82f 100755 --- a/resources/tssBed/create_per_gene_TSS_bins.py +++ b/resources/tssBed/create_per_gene_TSS_bins.py @@ -1,88 +1,99 @@ #!/usr/bin/env python3 -import pysam,sys,os,numpy,subprocess +import pysam, sys, os, numpy, subprocess import argparse -import tempfile,shutil +import tempfile, shutil import gzip -def _get_gene_metadata(l,what): - for i,j in enumerate(l): - if j == what: - return l[i+1].replace(';','').replace('"','') - else: - return '' + +def _get_gene_metadata(l, what): + for i, j in enumerate(l): + if j == what: + return l[i + 1].replace(";", "").replace('"', "") + else: + return "" + def get_gene_metadata(l): - metadata_str_list = l[8].strip().split() - geneID = _get_gene_metadata(metadata_str_list,'gene_id') - geneName = _get_gene_metadata(metadata_str_list,'gene_name') - geneType = '' - geneType = _get_gene_metadata(metadata_str_list,'gene_type') - if geneType == '': - geneType = _get_gene_metadata(metadata_str_list,'gene_biotype') - chrom = l[0] - strand = l[6] - start = int(l[3])-1 - end = int(l[4])-1 - if strand == '-': - tmp = start - start = end - end = tmp - score = '.' - return geneID, geneName, geneType, chrom, start, strand + metadata_str_list = l[8].strip().split() + geneID = _get_gene_metadata(metadata_str_list, "gene_id") + geneName = _get_gene_metadata(metadata_str_list, "gene_name") + geneType = "" + geneType = _get_gene_metadata(metadata_str_list, "gene_type") + if geneType == "": + geneType = _get_gene_metadata(metadata_str_list, "gene_biotype") + chrom = l[0] + strand = l[6] + start = int(l[3]) - 1 + end = int(l[4]) - 1 + if strand == "-": + tmp = start + start = end + end = tmp + score = "." + return geneID, geneName, geneType, chrom, start, strand -parser = argparse.ArgumentParser(description='Create windows around TSS for each gene in GTF file') -#parser.add_argument('--gtf',required=True,help='GTF',type=argparse.FileType('r')) -parser.add_argument('--gtf',required=True,help='GTF file path') -parser.add_argument('--out',required=True,help='Output tar.gz file') +parser = argparse.ArgumentParser( + description="Create windows around TSS for each gene in GTF file" +) +# parser.add_argument('--gtf',required=True,help='GTF',type=argparse.FileType('r')) +parser.add_argument("--gtf", required=True, help="GTF file path") +parser.add_argument("--out", required=True, help="Output tar.gz file") args = parser.parse_args() -gtfFile = open(args.gtf,'r') +gtfFile = open(args.gtf, "r") genes_dict = dict() gtfLines = gtfFile.readlines() gtfFile.close() for l in gtfLines: - l = l.strip() - if not l.startswith('#'): - l = l.split("\t") - if l[2] == "gene": - geneID, geneName, geneType, chrom, start, strand = get_gene_metadata(l) - if geneName == '': - geneName = geneID - if geneType == "protein_coding": - genes_dict[geneID] = dict() - genes_dict[geneID]['geneName'] = geneName - genes_dict[geneID]['start'] = start - genes_dict[geneID]['strand'] = strand - genes_dict[geneID]['chrom'] = chrom + l = l.strip() + if not l.startswith("#"): + l = l.split("\t") + if l[2] == "gene": + geneID, geneName, geneType, chrom, start, strand = get_gene_metadata(l) + if geneName == "": + geneName = geneID + if geneType == "protein_coding": + genes_dict[geneID] = dict() + genes_dict[geneID]["geneName"] = geneName + genes_dict[geneID]["start"] = start + genes_dict[geneID]["strand"] = strand + genes_dict[geneID]["chrom"] = chrom tmp_folder = tempfile.mkdtemp() -for geneID,metaData in genes_dict.items(): - outBED = os.path.join(str(tmp_folder),geneID+'.bed') - outBuffer = open(outBED,'w') - if metaData['strand']=='-': - r = range(400,0,-1) - else: - r = range(1,401,1) - newstart = metaData['start'] - 2000 - for i,j in enumerate(r): - s = newstart + i * 10 - e = s + 10 - regionName = geneID + "|" + metaData['geneName'] + "|" + str(j) - outList = [metaData['chrom'],str(s),str(e),regionName,'.',metaData['strand']] - outStr = "\t".join(outList) - outBuffer.write(outStr+'\n') - outBuffer.close() +for geneID, metaData in genes_dict.items(): + outBED = os.path.join(str(tmp_folder), geneID + ".bed") + outBuffer = open(outBED, "w") + if metaData["strand"] == "-": + r = range(400, 0, -1) + else: + r = range(1, 401, 1) + newstart = metaData["start"] - 2000 + for i, j in enumerate(r): + s = newstart + i * 10 + e = s + 10 + regionName = geneID + "|" + metaData["geneName"] + "|" + str(j) + outList = [ + metaData["chrom"], + str(s), + str(e), + regionName, + ".", + metaData["strand"], + ] + outStr = "\t".join(outList) + outBuffer.write(outStr + "\n") + outBuffer.close() wd = os.getcwd() os.chdir(tmp_folder) -cmd = 'tar czvf '+args.out+' *.bed' +cmd = "tar czvf " + args.out + " *.bed" result = subprocess.run(cmd, shell=True, capture_output=True, text=True) -shutil.move(os.path.join(tmp_folder,args.out),os.path.join(wd,args.out)) +shutil.move(os.path.join(tmp_folder, args.out), os.path.join(wd, args.out)) shutil.rmtree(tmp_folder) print(tmp_folder) -#chrX HAVANA gene 100627109 100639991 . - . gene_id "ENSG00000000003.14"; gene_type "protein_coding"; gene_name "TSPAN6"; level 2; havana_gene "OTTHUMG00000022002.1"; -#chrX 100625108 100625118 ENSG00000000003.14|TSPAN6|400 . - +# chrX HAVANA gene 100627109 100639991 . - . gene_id "ENSG00000000003.14"; gene_type "protein_coding"; gene_name "TSPAN6"; level 2; havana_gene "OTTHUMG00000022002.1"; +# chrX 100625108 100625118 ENSG00000000003.14|TSPAN6|400 . - diff --git a/resources/tssBed/create_ref_files.sh b/resources/tssBed/create_ref_files.sh index 46f8abd..3cf1b1b 100755 --- a/resources/tssBed/create_ref_files.sh +++ b/resources/tssBed/create_ref_files.sh @@ -1,7 +1,7 @@ #!/bin/bash -python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/hg19/gencode.v42lift37.basic.annotation.gtf --out hg19_v42_tssbeds.tar.gz -python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/hg38/gencode.v42.primary_assembly.annotation.gtf --out hg38_v42_tssbed.tar.gz -python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/mm10/gencode.vM25.annotation.gtf --out mm10_M25_tssbeds.tar.gz +python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/hg19/gencode.v42lift37.basic.annotation.gtf --out hg19_v42_tssbeds.tar.gz +python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/hg38/gencode.v42.primary_assembly.annotation.gtf --out hg38_v42_tssbed.tar.gz +python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/GTFs/mm10/gencode.vM25.annotation.gtf --out mm10_M25_tssbeds.tar.gz python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/Mmul_8.0.1_basic/genes.gtf --out mmul_tssbeds.tar.gz python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/mmul10/genes.gtf --out mmul10_v108_tssbeds.tar.gz python3 create_per_gene_TSS_bins.py --gtf /data/CCBR_Pipeliner/db/PipeDB/Indices/bosTau9/genes.gtf --out bosTau9_v108_tssbeds.tar.gz diff --git a/workflow/envs/dummy_env.yaml b/workflow/envs/dummy_env.yaml index 8b9de65..8724513 100644 --- a/workflow/envs/dummy_env.yaml +++ b/workflow/envs/dummy_env.yaml @@ -1,3 +1,3 @@ # conda environments needed for the workflow should be stored here. # An alternative to conda environments, is containers (singularity or docker) -# Check https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html?highlight=conda#containerization-of-conda-based-workflows for details. \ No newline at end of file +# Check https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html?highlight=conda#containerization-of-conda-based-workflows for details. diff --git a/workflow/scripts/DESeq2.Rmd b/workflow/scripts/DESeq2.Rmd index 99d10c3..787e804 100755 --- a/workflow/scripts/DESeq2.Rmd +++ b/workflow/scripts/DESeq2.Rmd @@ -139,7 +139,7 @@ pander(sampleinfo,style="rmarkdown") dds1 <- DESeqDataSetFromMatrix(countData = as.matrix(countmatrix), colData = sampleinfo[,c("replicateName","sampleName")], design = ~ 1) -dds1 <- DESeq(dds1) +dds1 <- DESeq(dds1) if (nrow(dds1)<2000) { rld1 <- varianceStabilizingTransformation(dds1, blind=TRUE) } else { @@ -252,7 +252,7 @@ EnhancedVolcano::EnhancedVolcano(resdf_w_anno, ## Open ROIs ```{r updown,include=TRUE,echo=FALSE,cache=FALSE,warning=FALSE} -# sometimes FC and FDR are too strict and up_roi/down_roi can be zero +# sometimes FC and FDR are too strict and up_roi/down_roi can be zero # to work with that scenario without errors: x=as.data.frame(rbind(table(up_roi$shortAnno),table(down_roi$shortAnno))) if (nrow(x) == 2) { @@ -269,4 +269,3 @@ if (nrow(x) == 2) { } # heatmap_matrix=assay(rld2) ``` - diff --git a/workflow/scripts/_ccbr_counts2density.py b/workflow/scripts/_ccbr_counts2density.py index 82c9bc8..e5fcb26 100644 --- a/workflow/scripts/_ccbr_counts2density.py +++ b/workflow/scripts/_ccbr_counts2density.py @@ -1,24 +1,25 @@ -# this script is specific to ccbr_tagAlign2TSS.bash +# this script is specific to ccbr_tagAlign2TSS.bash import sys -n2bin=dict() -density=dict() -for i,b in enumerate(range(-1995,2000,10)): - n2bin[i+1]=b - density[b]=0.0 -#for l in map(lambda x:x.strip().split("\t"),open(sys.argv[1]).readlines()): + +n2bin = dict() +density = dict() +for i, b in enumerate(range(-1995, 2000, 10)): + n2bin[i + 1] = b + density[b] = 0.0 +# for l in map(lambda x:x.strip().split("\t"),open(sys.argv[1]).readlines()): for l in sys.stdin: - l=l.strip().split("\t") - density[n2bin[int(l[0])]]=int(l[1]) -flanksum=0 -for i in range(1,11): - flanksum+=density[n2bin[i]] -for i in range(391,401): - flanksum+=density[n2bin[i]] -flankavg=flanksum*1.0/20 -alldensities=list() -for i in range(1,401): - density[n2bin[i]]=density[n2bin[i]]*1.0/flankavg - alldensities.append(density[n2bin[i]]) - print("%d\t%.4f"%(n2bin[i],density[n2bin[i]])) -print("# TSS enrichment: %.4f"%(max(alldensities))) \ No newline at end of file + l = l.strip().split("\t") + density[n2bin[int(l[0])]] = int(l[1]) +flanksum = 0 +for i in range(1, 11): + flanksum += density[n2bin[i]] +for i in range(391, 401): + flanksum += density[n2bin[i]] +flankavg = flanksum * 1.0 / 20 +alldensities = list() +for i in range(1, 401): + density[n2bin[i]] = density[n2bin[i]] * 1.0 / flankavg + alldensities.append(density[n2bin[i]]) + print("%d\t%.4f" % (n2bin[i], density[n2bin[i]])) +print("# TSS enrichment: %.4f" % (max(alldensities))) diff --git a/workflow/scripts/_featureCounts_header_fix.py b/workflow/scripts/_featureCounts_header_fix.py index d8dc7b6..1a9bcbf 100644 --- a/workflow/scripts/_featureCounts_header_fix.py +++ b/workflow/scripts/_featureCounts_header_fix.py @@ -1,22 +1,23 @@ -import os,sys +import os, sys -i=0 +i = 0 for l in sys.stdin: - i+=1 - if i==1: continue - if i==2: - outlist=[] - l=l.strip().split("\t") - for s in l: - if s.startswith("/"): - x=os.path.basename(s) - x=x.replace(".genrich.tn5nicks.bam","") - x=x.replace(".macs2.tn5nicks.bam","") - x=x.replace(".tn5sites.bam","") - x=x.replace(".reads.bam","") - outlist.append(x) - else: - outlist.append(s) - print("\t".join(outlist)) - else: - print(l.strip()) + i += 1 + if i == 1: + continue + if i == 2: + outlist = [] + l = l.strip().split("\t") + for s in l: + if s.startswith("/"): + x = os.path.basename(s) + x = x.replace(".genrich.tn5nicks.bam", "") + x = x.replace(".macs2.tn5nicks.bam", "") + x = x.replace(".tn5sites.bam", "") + x = x.replace(".reads.bam", "") + outlist.append(x) + else: + outlist.append(s) + print("\t".join(outlist)) + else: + print(l.strip()) diff --git a/workflow/scripts/_nrf.py b/workflow/scripts/_nrf.py index 812cab0..0cd9178 100644 --- a/workflow/scripts/_nrf.py +++ b/workflow/scripts/_nrf.py @@ -1,17 +1,18 @@ from __future__ import print_function import sys -preseq_log=sys.argv[1] -with open(preseq_log, 'r') as fp: - for line in fp: - if line.startswith('TOTAL READS'): - tot_reads = float(line.strip().split("= ")[1]) - elif line.startswith('DISTINCT READS'): - distinct_reads = float(line.strip().split('= ')[1]) - elif line.startswith('1\t'): - one_pair = float(line.strip().split()[1]) - elif line.startswith('2\t'): - two_pair = float(line.strip().split()[1]) -NRF = distinct_reads/tot_reads -PBC1 = one_pair/distinct_reads -PBC2 = one_pair/two_pair -print("NRF:%.3f\nPBC1:%.3f\nPBC2:%.3f"%(NRF,PBC1,PBC2)) + +preseq_log = sys.argv[1] +with open(preseq_log, "r") as fp: + for line in fp: + if line.startswith("TOTAL READS"): + tot_reads = float(line.strip().split("= ")[1]) + elif line.startswith("DISTINCT READS"): + distinct_reads = float(line.strip().split("= ")[1]) + elif line.startswith("1\t"): + one_pair = float(line.strip().split()[1]) + elif line.startswith("2\t"): + two_pair = float(line.strip().split()[1]) +NRF = distinct_reads / tot_reads +PBC1 = one_pair / distinct_reads +PBC2 = one_pair / two_pair +print("NRF:%.3f\nPBC1:%.3f\nPBC2:%.3f" % (NRF, PBC1, PBC2)) diff --git a/workflow/scripts/_print_replicate_scaling_factor.py b/workflow/scripts/_print_replicate_scaling_factor.py index 25b93cd..aea1132 100644 --- a/workflow/scripts/_print_replicate_scaling_factor.py +++ b/workflow/scripts/_print_replicate_scaling_factor.py @@ -1,5 +1,6 @@ import sys + def get_scaling_factor(sample_name): for line in sys.stdin: parts = line.strip().split("\t") @@ -8,13 +9,17 @@ def get_scaling_factor(sample_name): if parts[0] == sample_name: print(parts[2]) # Print only the scaling factor return - + print(f"Sample {sample_name} not found.", file=sys.stderr) sys.exit(1) + if __name__ == "__main__": if len(sys.argv) != 2: - print("Usage: cat scaling_factors.tsv | python _print_replicate_scaling_factor.py ", file=sys.stderr) + print( + "Usage: cat scaling_factors.tsv | python _print_replicate_scaling_factor.py ", + file=sys.stderr, + ) sys.exit(1) sample_name = sys.argv[1] diff --git a/workflow/scripts/_qc_annotated2peakwidthdensity.py b/workflow/scripts/_qc_annotated2peakwidthdensity.py index bc3dc13..c41cb52 100644 --- a/workflow/scripts/_qc_annotated2peakwidthdensity.py +++ b/workflow/scripts/_qc_annotated2peakwidthdensity.py @@ -1,12 +1,13 @@ import sys import pandas import numpy -x=pandas.read_csv(sys.argv[1],header=0,sep="\t",usecols=["width"]) + +x = pandas.read_csv(sys.argv[1], header=0, sep="\t", usecols=["width"]) x.dropna(inplace=True) -a,b=numpy.histogram(x['width'],bins=400,range=(0,20000)) -c=[] -for i in range(len(b)-1): - c.append(0.5*(b[i]+b[i+1])) -d=a*100.0/sum(a) +a, b = numpy.histogram(x["width"], bins=400, range=(0, 20000)) +c = [] +for i in range(len(b) - 1): + c.append(0.5 * (b[i] + b[i + 1])) +d = a * 100.0 / sum(a) for i in range(len(d)): - print("%d\t%.5f"%(c[i],d[i])) + print("%d\t%.5f" % (c[i], d[i])) diff --git a/workflow/scripts/_qc_create_frip_stats_table.bash b/workflow/scripts/_qc_create_frip_stats_table.bash index 797c087..4319650 100644 --- a/workflow/scripts/_qc_create_frip_stats_table.bash +++ b/workflow/scripts/_qc_create_frip_stats_table.bash @@ -38,11 +38,9 @@ for r in $replicates;do rm -f ${replicateName}.part* done > $RANDOMTXT -grep -m1 "replicateName" $RANDOMTXT > ${RANDOMTXT}.header +grep -m1 "replicateName" $RANDOMTXT > ${RANDOMTXT}.header grep -v "replicateName" $RANDOMTXT | sort > ${RANDOMTXT}.body cat ${RANDOMTXT}.header ${RANDOMTXT}.body rm -f ${RANDOMTXT}* - - diff --git a/workflow/scripts/_qc_make_qcstats_table.py b/workflow/scripts/_qc_make_qcstats_table.py index b24143b..f8374db 100644 --- a/workflow/scripts/_qc_make_qcstats_table.py +++ b/workflow/scripts/_qc_make_qcstats_table.py @@ -4,58 +4,63 @@ import pandas from functools import reduce -dfs_to_join=[] +dfs_to_join = [] try: - x=pandas.read_csv("Nreads_mqc.csv",sep="\t") - nreads=x.set_index("replicateName") - cols=list(nreads) - nreads['Nreads']=nreads.loc[:,cols].sum(axis=1) + x = pandas.read_csv("Nreads_mqc.csv", sep="\t") + nreads = x.set_index("replicateName") + cols = list(nreads) + nreads["Nreads"] = nreads.loc[:, cols].sum(axis=1) for i in cols: - j=i+"_perc" - nreads[j]=nreads[i]*100.0/nreads['Nreads'] + j = i + "_perc" + nreads[j] = nreads[i] * 100.0 / nreads["Nreads"] # nreads.head() - cols2=['Nreads'] + cols2 = ["Nreads"] for i in cols: cols2.append(i) - cols2.append(i+"_perc") - nreads=nreads.loc[:,cols2] + cols2.append(i + "_perc") + nreads = nreads.loc[:, cols2] # nreads.head() dfs_to_join.append(nreads) except FileNotFoundError: print("Nreads_mqc.csv not found!") try: - x=pandas.read_csv("data.tss_nicking_sites.txt",header=None,sep="\t",names=["replicateName","N_TSS_gt20knicks","TSSscore"]) - tss_stats=x.set_index("replicateName") + x = pandas.read_csv( + "data.tss_nicking_sites.txt", + header=None, + sep="\t", + names=["replicateName", "N_TSS_gt20knicks", "TSSscore"], + ) + tss_stats = x.set_index("replicateName") dfs_to_join.append(tss_stats) except FileNotFoundError: print("data.tss_nicking_sites.txt not found!") try: - x=pandas.read_csv("NRF_stats.tsv",sep="\t") - nrf=x.set_index("replicateName") + x = pandas.read_csv("NRF_stats.tsv", sep="\t") + nrf = x.set_index("replicateName") dfs_to_join.append(nrf) except FileNotFoundError: print("NRF_stats.tsv not found!") try: - x=pandas.read_csv("FRiP_stats.tsv",sep="\t") - frip=x.set_index("replicateName") + x = pandas.read_csv("FRiP_stats.tsv", sep="\t") + frip = x.set_index("replicateName") dfs_to_join.append(frip) except FileNotFoundError: print("FRiP_stats.tsv not found!") try: - x=pandas.read_csv("FLD_stats_peaks.tsv",sep="\t") - fld1=x.set_index("replicateName") + x = pandas.read_csv("FLD_stats_peaks.tsv", sep="\t") + fld1 = x.set_index("replicateName") dfs_to_join.append(fld1) except FileNotFoundError: print("FLD_stats_peaks.tsv not found!") try: - x=pandas.read_csv("FLD_stats_fractions_ratios.tsv",sep="\t") - fld2=x.set_index("replicateName") + x = pandas.read_csv("FLD_stats_fractions_ratios.tsv", sep="\t") + fld2 = x.set_index("replicateName") dfs_to_join.append(fld2) except FileNotFoundError: print("FLD_stats_fractions_ratios.tsv not found!") @@ -71,19 +76,19 @@ # allstats=allstats.join(fld2.set_index("replicateName")) try: - x=pandas.read_csv("MACS2_Peak_Annotations_mqc.csv",sep="\t") - macs=x.set_index("replicateName") - macs.columns=list(map(lambda z:"macs2_"+z,list(macs))) - cols=list(macs) - macs['macs2_Npeaks']=macs.loc[:,cols].sum(axis=1) + x = pandas.read_csv("MACS2_Peak_Annotations_mqc.csv", sep="\t") + macs = x.set_index("replicateName") + macs.columns = list(map(lambda z: "macs2_" + z, list(macs))) + cols = list(macs) + macs["macs2_Npeaks"] = macs.loc[:, cols].sum(axis=1) for i in cols: - j=i+"_perc" - macs[j]=macs[i]*100.0/macs['macs2_Npeaks'] - cols2=['macs2_Npeaks'] + j = i + "_perc" + macs[j] = macs[i] * 100.0 / macs["macs2_Npeaks"] + cols2 = ["macs2_Npeaks"] for i in cols: cols2.append(i) - cols2.append(i+"_perc") - macs=macs.loc[:,cols2] + cols2.append(i + "_perc") + macs = macs.loc[:, cols2] dfs_to_join.append(macs) except FileNotFoundError: print("MACS2_Peak_Annotations_mqc.csv not found!") @@ -92,28 +97,30 @@ # allstats=allstats.join(x) try: - x=pandas.read_csv("Genrich_Peak_Annotations_mqc.csv",sep="\t") - genrich=x.set_index("replicateName") - genrich.columns=list(map(lambda z:"genrich_"+z,list(genrich))) - cols=list(genrich) - genrich['genrich_Npeaks']=genrich.loc[:,cols].sum(axis=1) + x = pandas.read_csv("Genrich_Peak_Annotations_mqc.csv", sep="\t") + genrich = x.set_index("replicateName") + genrich.columns = list(map(lambda z: "genrich_" + z, list(genrich))) + cols = list(genrich) + genrich["genrich_Npeaks"] = genrich.loc[:, cols].sum(axis=1) for i in cols: - j=i+"_perc" - genrich[j]=genrich[i]*100.0/genrich['genrich_Npeaks'] - cols2=['genrich_Npeaks'] + j = i + "_perc" + genrich[j] = genrich[i] * 100.0 / genrich["genrich_Npeaks"] + cols2 = ["genrich_Npeaks"] for i in cols: cols2.append(i) - cols2.append(i+"_perc") - genrich=genrich.loc[:,cols2] + cols2.append(i + "_perc") + genrich = genrich.loc[:, cols2] dfs_to_join.append(genrich) except FileNotFoundError: print("Genrich_Peak_Annotations_mqc.csv not found!") # allstats=allstats.join(x) -if len(dfs_to_join)>0: - allstats=reduce(lambda a,b:pandas.merge(a,b,how="outer",on="replicateName"),dfs_to_join) - allstats=allstats.round(3) - allstats.to_csv("QCStats.tsv",sep="\t") +if len(dfs_to_join) > 0: + allstats = reduce( + lambda a, b: pandas.merge(a, b, how="outer", on="replicateName"), dfs_to_join + ) + allstats = allstats.round(3) + allstats.to_csv("QCStats.tsv", sep="\t") else: print("Nothing to add to QCStats.tsv!!") diff --git a/workflow/scripts/_scale_counts.py b/workflow/scripts/_scale_counts.py index f3f74e0..8e9e439 100644 --- a/workflow/scripts/_scale_counts.py +++ b/workflow/scripts/_scale_counts.py @@ -1,21 +1,24 @@ import pandas as pd import argparse + def main(scaling_factors_file, counts_file, output_file): # Read scaling factors scaling_factors_df = pd.read_csv( - scaling_factors_file, - sep='\t', - header=None, - names=['replicateName', 'replicateReads', 'scalingFactor'] + scaling_factors_file, + sep="\t", + header=None, + names=["replicateName", "replicateReads", "scalingFactor"], + ) + scaling_factors = dict( + zip(scaling_factors_df["replicateName"], scaling_factors_df["scalingFactor"]) ) - scaling_factors = dict(zip(scaling_factors_df['replicateName'], scaling_factors_df['scalingFactor'])) # Read counts file - counts_df = pd.read_csv(counts_file, sep='\t') + counts_df = pd.read_csv(counts_file, sep="\t") # List of columns to keep as-is - fixed_columns = ['Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length'] + fixed_columns = ["Geneid", "Chr", "Start", "End", "Strand", "Length"] # Apply scaling factors to replicate columns for col in counts_df.columns: @@ -23,14 +26,23 @@ def main(scaling_factors_file, counts_file, output_file): counts_df[col] = (counts_df[col] * scaling_factors[col]).round().astype(int) # Write the output to a tab-delimited file - counts_df.to_csv(output_file, sep='\t', index=False) + counts_df.to_csv(output_file, sep="\t", index=False) print("Scaled counts written to {}".format(output_file)) + if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Scale counts using scaling factors.') - parser.add_argument('--scaling_factors', required=True, help='Path to the scaling factors TSV file') - parser.add_argument('--counts_file', required=True, help='Path to the counts TSV file') - parser.add_argument('--output_file', required=True, help='Path to save the scaled counts output TSV file') - + parser = argparse.ArgumentParser(description="Scale counts using scaling factors.") + parser.add_argument( + "--scaling_factors", required=True, help="Path to the scaling factors TSV file" + ) + parser.add_argument( + "--counts_file", required=True, help="Path to the counts TSV file" + ) + parser.add_argument( + "--output_file", + required=True, + help="Path to save the scaled counts output TSV file", + ) + args = parser.parse_args() - main(args.scaling_factors, args.counts_file, args.output_file) \ No newline at end of file + main(args.scaling_factors, args.counts_file, args.output_file) diff --git a/workflow/scripts/_transpose.sh b/workflow/scripts/_transpose.sh index 5cfbd84..63e0184 100644 --- a/workflow/scripts/_transpose.sh +++ b/workflow/scripts/_transpose.sh @@ -1,11 +1,11 @@ awk ' -{ +{ for (i=1; i<=NF; i++) { a[NR,i] = $i } } NF>p { p = NF } -END { +END { for(j=1; j<=p; j++) { str=a[1,j] for(i=2; i<=NR; i++){ diff --git a/workflow/scripts/atac_assign_multimappers.py b/workflow/scripts/atac_assign_multimappers.py index 3d21b0e..7f9f601 100755 --- a/workflow/scripts/atac_assign_multimappers.py +++ b/workflow/scripts/atac_assign_multimappers.py @@ -7,13 +7,21 @@ import random import argparse + def parse_args(): - ''' + """ Gives options - ''' - parser = argparse.ArgumentParser(description='Saves reads below a alignment threshold and discards all others') - parser.add_argument('-k', help='Alignment number cutoff') - parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') + """ + parser = argparse.ArgumentParser( + description="Saves reads below a alignment threshold and discards all others" + ) + parser.add_argument("-k", help="Alignment number cutoff") + parser.add_argument( + "--paired-end", + dest="paired_ended", + action="store_true", + help="Data is paired-end", + ) args = parser.parse_args() alignment_cutoff = int(args.k) paired_ended = args.paired_ended @@ -22,26 +30,25 @@ def parse_args(): if __name__ == "__main__": - ''' + """ Runs the filtering step of choosing multimapped reads - ''' + """ [alignment_cutoff, paired_ended] = parse_args() if paired_ended: alignment_cutoff = int(alignment_cutoff) * 2 - # Store each line in sam file as a list of reads, - # where each read is a list of elements to easily + # Store each line in sam file as a list of reads, + # where each read is a list of elements to easily # modify or grab things - current_reads = [] - current_qname = '' + current_reads = [] + current_qname = "" for line in sys.stdin: + read_elems = line.strip().split("\t") - read_elems = line.strip().split('\t') - - if read_elems[0].startswith('@'): + if read_elems[0].startswith("@"): sys.stdout.write(line) continue @@ -68,4 +75,3 @@ def parse_args(): # First read in file current_reads.append(line) current_qname = read_elems[0] - diff --git a/workflow/scripts/ccbr_annotate_bed.R b/workflow/scripts/ccbr_annotate_bed.R index 34d62ea..9c51755 100644 --- a/workflow/scripts/ccbr_annotate_bed.R +++ b/workflow/scripts/ccbr_annotate_bed.R @@ -18,8 +18,8 @@ suppressPackageStartupMessages(library("org.Bt.eg.db")) parser <- ArgumentParser() -# specify our desired options -# by default ArgumentParser will add an help option +# specify our desired options +# by default ArgumentParser will add an help option parser$add_argument("-b", "--bed", required=TRUE, dest="bed", help="narrowpeak file") @@ -34,7 +34,7 @@ parser$add_argument("-g", "--genome", required=TRUE, dest="genome", help="hg38/hg19/mm10/mm9/mmul10/bosTau9") # get command line options, if help option encountered print help and exit, -# otherwise if options not found on command line then set defaults, +# otherwise if options not found on command line then set defaults, args <- parser$parse_args() if (args$genome=="mm9" | args$genome=="mm10"){ @@ -76,11 +76,11 @@ pa=annotatePeak(peak = peaks, TxDb = tdb, level = "transcript", genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), - annoDb = adb, - sameStrand = FALSE, + annoDb = adb, + sameStrand = FALSE, ignoreOverlap = FALSE, - ignoreUpstream = FALSE, - ignoreDownstream = FALSE, + ignoreUpstream = FALSE, + ignoreDownstream = FALSE, overlap = "TSS") padf=as.data.frame(pa) @@ -124,13 +124,13 @@ colnames(merged)=c("#peakID", "SYMBOL", "GENENAME") -# merge annotation with narrowPeak file +# merge annotation with narrowPeak file write.table(merged,args$annotated,sep = "\t",quote = FALSE, row.names = FALSE) l=paste("# Median peak width : ",median(merged$width),sep="") write(l,args$annotated,append=TRUE) -# get promoter genes +# get promoter genes # ... all lines with annotation starting with "Promoter" promoters1=dplyr::filter(merged,grepl("Promoter",annotation)) # ... all lines with annotation is "5' UTR" @@ -165,5 +165,5 @@ for (ann in c("Exon","Intron")){ x=dplyr::filter(merged,grepl(ann,annotation)) w=median(x$width) l=paste(gsub(" ","",ann),nrow(x),w,sep="\t") - write(l,args$atypefreq,append=TRUE) + write(l,args$atypefreq,append=TRUE) } diff --git a/workflow/scripts/ccbr_annotate_peaks.R b/workflow/scripts/ccbr_annotate_peaks.R index a1c54b4..bf3994c 100644 --- a/workflow/scripts/ccbr_annotate_peaks.R +++ b/workflow/scripts/ccbr_annotate_peaks.R @@ -18,8 +18,8 @@ suppressPackageStartupMessages(library("org.Bt.eg.db")) parser <- ArgumentParser() -# specify our desired options -# by default ArgumentParser will add an help option +# specify our desired options +# by default ArgumentParser will add an help option parser$add_argument("-n", "--narrowpeak", required=TRUE, dest="narrowpeak", help="narrowpeak file") @@ -34,7 +34,7 @@ parser$add_argument("-g", "--genome", required=TRUE, dest="genome", help="hg38/hg19/mm10/mm9/mmul10/bosTau9") # get command line options, if help option encountered print help and exit, -# otherwise if options not found on command line then set defaults, +# otherwise if options not found on command line then set defaults, args <- parser$parse_args() if (args$genome=="mm9" | args$genome=="mm10"){ @@ -83,11 +83,11 @@ pa=annotatePeak(peak = peaks, TxDb = tdb, level = "transcript", genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron", "Downstream", "Intergenic"), - annoDb = adb, - sameStrand = FALSE, + annoDb = adb, + sameStrand = FALSE, ignoreOverlap = FALSE, - ignoreUpstream = FALSE, - ignoreDownstream = FALSE, + ignoreUpstream = FALSE, + ignoreDownstream = FALSE, overlap = "TSS") padf=as.data.frame(pa) @@ -142,7 +142,7 @@ colnames(merged)=c("#peakID", "peak") -# merge annotation with narrowPeak file +# merge annotation with narrowPeak file merged=merged[order(-merged$qValue),] write.table(merged,args$annotated,sep = "\t",quote = FALSE, row.names = FALSE) l=paste("# Median peak width : ",median(merged$width),sep="") @@ -153,7 +153,7 @@ l=paste("# Median qValue : ",median(merged$qValue),sep="") write(l,args$annotated,append=TRUE) -# get promoter genes +# get promoter genes # ... all lines with annotation starting with "Promoter" promoters1=dplyr::filter(merged,grepl("Promoter",annotation)) # ... all lines with annotation is "5' UTR" @@ -197,5 +197,5 @@ for (ann in c("Exon","Intron")){ p=median(x$pValue) q=median(x$qValue) l=paste(gsub(" ","",ann),nrow(x),w,p,q,sep="\t") - write(l,args$atypefreq,append=TRUE) + write(l,args$atypefreq,append=TRUE) } diff --git a/workflow/scripts/ccbr_atac_bam2FLD.py b/workflow/scripts/ccbr_atac_bam2FLD.py index 92cc152..420eaeb 100644 --- a/workflow/scripts/ccbr_atac_bam2FLD.py +++ b/workflow/scripts/ccbr_atac_bam2FLD.py @@ -1,31 +1,38 @@ -import pysam,sys,os,numpy +import pysam, sys, os, numpy from scipy.signal import find_peaks_cwt import argparse + + def get_max_distance(l): - mi=min(l) - ma=max(l) - return ma-mi + mi = min(l) + ma = max(l) + return ma - mi + -def is_peak_in_range(peaks,a,b): - if len(numpy.where(numpy.logical_and(peaks>a,peaks= 1: +def is_peak_in_range(peaks, a, b): + if len(numpy.where(numpy.logical_and(peaks > a, peaks < b))[0]) >= 1: return "PRESENT" else: return "ABSENT" -def get_sum_in_range(freq,a,b): - sum=0 - for i in range(a,b,1): - sum+=freq[i] + +def get_sum_in_range(freq, a, b): + sum = 0 + for i in range(a, b, 1): + sum += freq[i] return sum -parser = argparse.ArgumentParser(description='calculate normalized fragment length distribution from PE bam file') -parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File') -parser.add_argument('-o',dest='out',required=True,help='Output tab-delimited File') + +parser = argparse.ArgumentParser( + description="calculate normalized fragment length distribution from PE bam file" +) +parser.add_argument("-i", dest="inBam", required=True, help="Input Bam File") +parser.add_argument("-o", dest="out", required=True, help="Output tab-delimited File") args = parser.parse_args() -if not os.path.exists(args.inBam+".bai"): +if not os.path.exists(args.inBam + ".bai"): pysam.index(args.inBam) samfile = pysam.AlignmentFile(args.inBam, "rb") -refcoordsperread=dict() +refcoordsperread = dict() for read in samfile.fetch(): if read.is_unmapped: continue @@ -37,22 +44,22 @@ def get_sum_in_range(freq,a,b): continue if read.is_proper_pair: if not read.query_name in refcoordsperread: - refcoordsperread[read.query_name]=list() + refcoordsperread[read.query_name] = list() refcoordsperread[read.query_name].append(int(read.reference_start)) refcoordsperread[read.query_name].append(int(read.reference_end)) -fraglens=list() -for k,v in refcoordsperread.items(): - fraglens.append(get_max_distance(refcoordsperread[k])) -o=open(args.out,'w') -s=len(fraglens) -freqs=dict() -for i in range(0,max(fraglens)+1,1): - freqs[i]=0 -for i in range(min(fraglens),max(fraglens)+1,1): - c=fraglens.count(i) - freqs[i]=c +fraglens = list() +for k, v in refcoordsperread.items(): + fraglens.append(get_max_distance(refcoordsperread[k])) +o = open(args.out, "w") +s = len(fraglens) +freqs = dict() +for i in range(0, max(fraglens) + 1, 1): + freqs[i] = 0 +for i in range(min(fraglens), max(fraglens) + 1, 1): + c = fraglens.count(i) + freqs[i] = c if c != 0: - o.write("%d\t%.6f\n"%(i,c*1000.0/s)) + o.write("%d\t%.6f\n" % (i, c * 1000.0 / s)) # peaks=find_peaks_cwt(freqs.values(),numpy.array([25])) # freq.values() may contain a string of zeros which can lead to the following error: @@ -67,25 +74,25 @@ def get_sum_in_range(freq,a,b): # raise ValueError("volume and kernel should have the same " # ValueError: volume and kernel should have the same dimensionality # Here is a solution: -peaks=find_peaks_cwt(list(map(lambda x:x+1,freqs.values())),numpy.array([25])) +peaks = find_peaks_cwt(list(map(lambda x: x + 1, freqs.values())), numpy.array([25])) -nfr_peak=is_peak_in_range(peaks,20,90) -mono_nuc_peak=is_peak_in_range(peaks,120,250) -di_nuc_peak=is_peak_in_range(peaks,300,500) +nfr_peak = is_peak_in_range(peaks, 20, 90) +mono_nuc_peak = is_peak_in_range(peaks, 120, 250) +di_nuc_peak = is_peak_in_range(peaks, 300, 500) -nfr_reads=get_sum_in_range(freqs,0,150) -mono_nuc_reads=get_sum_in_range(freqs,150,300) -non_nfr_reads=sum(freqs.values())-nfr_reads -nfr_to_mono_nuc_ratio=nfr_reads*1.0/mono_nuc_reads -nfr_to_other_ratio=nfr_reads*1.0/non_nfr_reads +nfr_reads = get_sum_in_range(freqs, 0, 150) +mono_nuc_reads = get_sum_in_range(freqs, 150, 300) +non_nfr_reads = sum(freqs.values()) - nfr_reads +nfr_to_mono_nuc_ratio = nfr_reads * 1.0 / mono_nuc_reads +nfr_to_other_ratio = nfr_reads * 1.0 / non_nfr_reads -o.write("# PEAKS : %s\n"%(peaks)) -o.write("# NUCLEOSOME_FREE_PEAK_PRESENT : %s\n"%(nfr_peak)) -o.write("# MONONUCLEOSOME_PEAK_PRESENT : %s\n"%(mono_nuc_peak)) -o.write("# DINUCLEOSOME_PEAK_PRESENT : %s\n"%(di_nuc_peak)) -o.write("# NUCLEOSOME_FREE_READ_FRACTION : %0.3f\n"%(nfr_reads*1.0/s)) -o.write("# MONONUCLEOSOME_READ_FRACTION : %0.3f\n"%(mono_nuc_reads*1.0/s)) -o.write("# NFR_TO_MONONUCLEOSOME_READS_RATIO : %0.3f\n"%(nfr_to_mono_nuc_ratio)) -o.write("# NFR_TO_NON_NFR_READS_RATIO : %0.3f\n"%(nfr_to_other_ratio)) +o.write("# PEAKS : %s\n" % (peaks)) +o.write("# NUCLEOSOME_FREE_PEAK_PRESENT : %s\n" % (nfr_peak)) +o.write("# MONONUCLEOSOME_PEAK_PRESENT : %s\n" % (mono_nuc_peak)) +o.write("# DINUCLEOSOME_PEAK_PRESENT : %s\n" % (di_nuc_peak)) +o.write("# NUCLEOSOME_FREE_READ_FRACTION : %0.3f\n" % (nfr_reads * 1.0 / s)) +o.write("# MONONUCLEOSOME_READ_FRACTION : %0.3f\n" % (mono_nuc_reads * 1.0 / s)) +o.write("# NFR_TO_MONONUCLEOSOME_READS_RATIO : %0.3f\n" % (nfr_to_mono_nuc_ratio)) +o.write("# NFR_TO_NON_NFR_READS_RATIO : %0.3f\n" % (nfr_to_other_ratio)) -o.close() \ No newline at end of file +o.close() diff --git a/workflow/scripts/ccbr_atac_bam2tn5bed.py b/workflow/scripts/ccbr_atac_bam2tn5bed.py index 6485610..047c412 100644 --- a/workflow/scripts/ccbr_atac_bam2tn5bed.py +++ b/workflow/scripts/ccbr_atac_bam2tn5bed.py @@ -1,11 +1,14 @@ import pysam import argparse + def extract_fragments(input_bam, output_tn5, output_reads, threads): # Open the input BAM file with multiple threads bamfile = pysam.AlignmentFile(input_bam, "rb", threads=threads) - with open(output_tn5, "w") as tn5bedoutfile, open(output_reads, "w") as readsoutfile: + with open(output_tn5, "w") as tn5bedoutfile, open( + output_reads, "w" + ) as readsoutfile: prev_read = None # Store Read1 for read in bamfile.fetch(until_eof=True): # Iterate without index lookup @@ -14,7 +17,9 @@ def extract_fragments(input_bam, output_tn5, output_reads, threads): if prev_read and prev_read.query_name == read.query_name: # Assign Read1 and Read2 correctly - read1, read2 = (prev_read, read) if prev_read.is_read1 else (read, prev_read) + read1, read2 = ( + (prev_read, read) if prev_read.is_read1 else (read, prev_read) + ) # Ensure both reads are on the same chromosome if read1.reference_id != read2.reference_id: @@ -34,8 +39,12 @@ def extract_fragments(input_bam, output_tn5, output_reads, threads): read2_strand = "+" if not read2.is_reverse else "-" read_name = read1.query_name - tn5bedoutfile.write(f"{read1.reference_name}\t{start}\t{start+1}\t{read_name}_+\t.\t+\n") - tn5bedoutfile.write(f"{read1.reference_name}\t{end}\t{end+1}\t{read_name}_-\t.\t-\n") + tn5bedoutfile.write( + f"{read1.reference_name}\t{start}\t{start+1}\t{read_name}_+\t.\t+\n" + ) + tn5bedoutfile.write( + f"{read1.reference_name}\t{end}\t{end+1}\t{read_name}_-\t.\t-\n" + ) # Adjust Read1 and Read2 positions for output read1_start, read1_end = read1.reference_start, read1.reference_end @@ -52,8 +61,12 @@ def extract_fragments(input_bam, output_tn5, output_reads, threads): read2_end -= 5 # Write Read1 and Read2 positions to file - readsoutfile.write(f"{read1.reference_name}\t{read1_start}\t{read1_end}\t{read_name}_{read1_strand}\t.\t{read1_strand}\n") - readsoutfile.write(f"{read2.reference_name}\t{read2_start}\t{read2_end}\t{read_name}_{read2_strand}\t.\t{read2_strand}\n") + readsoutfile.write( + f"{read1.reference_name}\t{read1_start}\t{read1_end}\t{read_name}_{read1_strand}\t.\t{read1_strand}\n" + ) + readsoutfile.write( + f"{read2.reference_name}\t{read2_start}\t{read2_end}\t{read_name}_{read2_strand}\t.\t{read2_strand}\n" + ) prev_read = None # Reset for next read pair else: @@ -61,11 +74,20 @@ def extract_fragments(input_bam, output_tn5, output_reads, threads): bamfile.close() + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Extract Tn5 fragment sites and reads from a BAM file.") - parser.add_argument("-i", "--bam", required=True, help="Input BAM file (query-name sorted)") - parser.add_argument("-t", "--tn5bed", required=True, help="Output Tn5 insertion sites BED file") + parser = argparse.ArgumentParser( + description="Extract Tn5 fragment sites and reads from a BAM file." + ) + parser.add_argument( + "-i", "--bam", required=True, help="Input BAM file (query-name sorted)" + ) + parser.add_argument( + "-t", "--tn5bed", required=True, help="Output Tn5 insertion sites BED file" + ) parser.add_argument("-b", "--readsbed", required=True, help="Output reads BED file") - parser.add_argument("-n", "--ncpus", required=False, default=2, help="Number of CPUs to use") + parser.add_argument( + "-n", "--ncpus", required=False, default=2, help="Number of CPUs to use" + ) args = parser.parse_args() - extract_fragments(args.bam, args.tn5bed, args.readsbed,args.ncpus) + extract_fragments(args.bam, args.tn5bed, args.readsbed, args.ncpus) diff --git a/workflow/scripts/ccbr_atac_macs2_peak_calling.bash b/workflow/scripts/ccbr_atac_macs2_peak_calling.bash index 52f71b2..12c28ff 100644 --- a/workflow/scripts/ccbr_atac_macs2_peak_calling.bash +++ b/workflow/scripts/ccbr_atac_macs2_peak_calling.bash @@ -41,7 +41,7 @@ callPeaks(){ for f in ${PREFIX}.unfiltered.narrowPeak ${PREFIX}.narrowPeak;do npeaks=$(wc -l $f|awk '{print $1}') if [ "$npeaks" -gt "0" ];then - Rscript ${SCRIPTSFOLDER}/ccbr_annotate_peaks.R -n $f -a ${f}.annotated -g $GENOME -l ${f}.genelist -f ${f}.annotation_summary + Rscript ${SCRIPTSFOLDER}/ccbr_annotate_peaks.R -n $f -a ${f}.annotated -g $GENOME -l ${f}.genelist -f ${f}.annotation_summary cut -f1,2 ${f}.annotation_summary > ${f}.annotation_distribution else touch ${f}.annotated @@ -59,9 +59,9 @@ callPeaks(){ # # MACS will save fragment pileup signal per million reads # -# Genrich normalization is as per 1x genome coverage... and macs2 normalization is per million reads +# Genrich normalization is as per 1x genome coverage... and macs2 normalization is per million reads # bedSort ${PREFIX}_treat_pileup.bdg ${PREFIX}_treat_pileup.bdg -# # some coordinates go out of chromosome size in macs2 bam2bw conversion.... using a working around +# # some coordinates go out of chromosome size in macs2 bam2bw conversion.... using a working around # awk -F"\t" -v OFS="\t" '{print $1,"0",$2}' $GENOMEFILE > /dev/shm/${PREFIX}.genome.bed # bedtools intersect -a ${PREFIX}_treat_pileup.bdg -b /dev/shm/${PREFIX}.genome.bed > /dev/shm/${PREFIX}.bg # mv /dev/shm/${PREFIX}.bg ${PREFIX}_treat_pileup.bdg @@ -82,7 +82,7 @@ callPeaks(){ # bedToBam -i ${nicksBED%.*}.tmp.bed -g $GENOMEFILE > $nicksBAM # samtools sort -@4 -o ${nicksBAM%.*}.sorted.bam $nicksBAM # mv ${nicksBAM%.*}.sorted.bam $nicksBAM -# rm -f ${nicksBED%.*}.tmp.bed +# rm -f ${nicksBED%.*}.tmp.bed # pigz -f -p4 $nicksBED # mv ${nicksBED}.gz ${OUTDIR}/tn5nicks/${nicksBED}.gz # mv ${nicksBAM} ${OUTDIR}/tn5nicks/${nicksBAM} @@ -108,7 +108,7 @@ Output files: ** tn5nicks BAM file (required for downstream DiffBind analysis ... in tn5nicks subfolder) ** bigwig files (in bigwig subfolder) * Pooled narrowPeak file with annotations -* Consensus BED file with annotations +* Consensus BED file with annotations """ source /opt2/argparse.bash || exit 1 argparse "$@" <> ame_results.txt mv ame_results.txt ${outdirbasename} rsync -az --progress --verbose ${outdirbasename}/ $OUTDIR/ -rm -rf *.* \ No newline at end of file +rm -rf *.* diff --git a/workflow/scripts/ccbr_atac_qc.bash b/workflow/scripts/ccbr_atac_qc.bash index 538fcbc..9443858 100755 --- a/workflow/scripts/ccbr_atac_qc.bash +++ b/workflow/scripts/ccbr_atac_qc.bash @@ -78,7 +78,7 @@ mv NRF_stats.txt.tmp NRF_stats.tsv #annotated2peakwidthdensity.sh #OUTPUT:.peak_width_density files in peak_annotation subfolder ##### -for f in `find . -name "*consensus*annotated"`;do +for f in `find . -name "*consensus*annotated"`;do echo $f python ${SCRIPTSDIR}/_qc_annotated2peakwidthdensity.py $f > ${f}.peak_width_density gzip -n $f @@ -113,7 +113,7 @@ rm -f $RANDOMTXT #OUTPUT:FLD_stats.txt ##### count=0 -delete_file_if_present FLD_stats_peaks.tsv +delete_file_if_present FLD_stats_peaks.tsv delete_file_if_present FLD_stats_fractions_ratios.tsv for f in `find . -name "*fld.txt"`;do count=$((count+1)) diff --git a/workflow/scripts/ccbr_atac_tagAlign2TSS.bash b/workflow/scripts/ccbr_atac_tagAlign2TSS.bash index 9e7f687..6610ef4 100644 --- a/workflow/scripts/ccbr_atac_tagAlign2TSS.bash +++ b/workflow/scripts/ccbr_atac_tagAlign2TSS.bash @@ -3,7 +3,7 @@ # conda activate python3 set -e -x -o pipefail -ARGPARSE_DESCRIPTION="calculate TSS score from tagAlign file" +ARGPARSE_DESCRIPTION="calculate TSS score from tagAlign file" source /opt2/argparse.bash || \ source <(curl -s https://raw.githubusercontent.com/CCBR/Tools/master/scripts/argparse.bash) || \ @@ -30,7 +30,7 @@ cp $TSSTARGZ . tar xzvf $(basename $TSSTARGZ) tar tzf $TSSTARGZ > beds.lst while read f;do -# for f in `ls *.bed|grep -v tn5nicks`;do +# for f in `ls *.bed|grep -v tn5nicks`;do echo "bedmap --echo --count $f $nicksbed|awk -F\"\t\" '{if (\$6!=\"+|0\" && \$6!=\"-|0\"){print \$4,\$6}}'|sed \"s/ /|/g\"|awk -F\"|\" -v OFS=\"\t\" '{print \$3,\$5}' > ${f}.counts" done < beds.lst > do_bedmap head do_bedmap @@ -55,4 +55,4 @@ fi echo "# TSS with 20 or more Tn5 nicking sites: $ntss" >> $TSSTXT -# conda deactivate \ No newline at end of file +# conda deactivate diff --git a/workflow/scripts/ccbr_atac_trim_align_pe.bash b/workflow/scripts/ccbr_atac_trim_align_pe.bash index 998a366..56f5c85 100644 --- a/workflow/scripts/ccbr_atac_trim_align_pe.bash +++ b/workflow/scripts/ccbr_atac_trim_align_pe.bash @@ -61,4 +61,4 @@ rm -f $trimmedfastq1 \ $trimmedfastq2 \ $noBLfastq1 \ $noBLfastq2 -fi \ No newline at end of file +fi diff --git a/workflow/scripts/ccbr_bam_filter_by_mapq.py b/workflow/scripts/ccbr_bam_filter_by_mapq.py index 22b1dc2..23ff49a 100755 --- a/workflow/scripts/ccbr_bam_filter_by_mapq.py +++ b/workflow/scripts/ccbr_bam_filter_by_mapq.py @@ -1,40 +1,45 @@ -import pysam,sys +import pysam, sys import argparse -parser = argparse.ArgumentParser(description='filter PE bamfile by mapQ values') -parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File') -parser.add_argument('-o',dest='outBam',required=True,help='Output Bam File') -parser.add_argument('-q',dest='mapQ',type=int,required=False,help='mapQ value ... default 6',default=6) +parser = argparse.ArgumentParser(description="filter PE bamfile by mapQ values") +parser.add_argument("-i", dest="inBam", required=True, help="Input Bam File") +parser.add_argument("-o", dest="outBam", required=True, help="Output Bam File") +parser.add_argument( + "-q", + dest="mapQ", + type=int, + required=False, + help="mapQ value ... default 6", + default=6, +) args = parser.parse_args() samfile = pysam.AlignmentFile(args.inBam, "rb") -mapq=dict() +mapq = dict() for read in samfile.fetch(): - if read.is_unmapped: - continue - if read.is_supplementary: - continue - if read.is_secondary: - continue - if read.is_duplicate: - continue - if read.is_proper_pair: - if read.mapping_quality < args.mapQ and read.query_name in mapq: - del mapq[read.query_name] - if read.mapping_quality >= args.mapQ and not read.query_name in mapq: - mapq[read.query_name]=1 + if read.is_unmapped: + continue + if read.is_supplementary: + continue + if read.is_secondary: + continue + if read.is_duplicate: + continue + if read.is_proper_pair: + if read.mapping_quality < args.mapQ and read.query_name in mapq: + del mapq[read.query_name] + if read.mapping_quality >= args.mapQ and not read.query_name in mapq: + mapq[read.query_name] = 1 samfile.close() samfile = pysam.AlignmentFile(args.inBam, "rb") pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile) for read in samfile.fetch(): - if read.query_name in mapq: - if read.is_supplementary: - continue - if read.is_secondary: - continue - if read.is_duplicate: - continue - pairedreads.write(read) + if read.query_name in mapq: + if read.is_supplementary: + continue + if read.is_secondary: + continue + if read.is_duplicate: + continue + pairedreads.write(read) samfile.close() pairedreads.close() - - diff --git a/workflow/scripts/ccbr_bowtie2_align_pe.bash b/workflow/scripts/ccbr_bowtie2_align_pe.bash index 50f87f0..dc4bb72 100644 --- a/workflow/scripts/ccbr_bowtie2_align_pe.bash +++ b/workflow/scripts/ccbr_bowtie2_align_pe.bash @@ -21,7 +21,7 @@ parser.add_argument('--keepfiles',required=False,default='False',help='to keep i EOF # outputs -# ${samplename}.bowtie2.bam(.bai|.flagstat) for original bowtie2 alignment +# ${samplename}.bowtie2.bam(.bai|.flagstat) for original bowtie2 alignment # ${samplename}.qsorted.bam(.bai|.flagstat) for Genrich peak calling # ${samplename}.filtered.bam(.bai|.flagstat) for counting reads/tn5 nicks # ${samplename}.dedup.bam(.bai|.flagstat) used to generate tagAlign.gz for MACS2 peak calling diff --git a/workflow/scripts/ccbr_cutadapt_pe.bash b/workflow/scripts/ccbr_cutadapt_pe.bash index adaf71f..81b944f 100644 --- a/workflow/scripts/ccbr_cutadapt_pe.bash +++ b/workflow/scripts/ccbr_cutadapt_pe.bash @@ -41,4 +41,4 @@ $infastq1 $infastq2 # nreadstrim=`zcat $outfastq1|wc -l` # echo "$nreads $nreadstrim"|awk '{printf("%d\tInput Nreads\n%d\tAfter trimming\n",$1/2,$2/2)}' > ${SAMPLEINFO} -# conda deactivate \ No newline at end of file +# conda deactivate diff --git a/workflow/scripts/ccbr_frip.bash b/workflow/scripts/ccbr_frip.bash index 72e10d1..c203a7e 100644 --- a/workflow/scripts/ccbr_frip.bash +++ b/workflow/scripts/ccbr_frip.bash @@ -1,7 +1,7 @@ #!/bin/bash set -e -x -o pipefail -ARGPARSE_DESCRIPTION="fraction of reads in peaks (and DHS/Enhancers/Promoters [if genome is provided])" +ARGPARSE_DESCRIPTION="fraction of reads in peaks (and DHS/Enhancers/Promoters [if genome is provided])" source /opt2/argparse.bash || \ source <(curl -s https://raw.githubusercontent.com/CCBR/Tools/master/scripts/argparse.bash) || \ exit 1 @@ -48,4 +48,4 @@ echo -ne "$SAMPLENAME\tFRiEnh\t$frienh\n" fi fi -fi \ No newline at end of file +fi diff --git a/workflow/scripts/ccbr_get_consensus_peaks.py b/workflow/scripts/ccbr_get_consensus_peaks.py index 581eb98..b407f63 100644 --- a/workflow/scripts/ccbr_get_consensus_peaks.py +++ b/workflow/scripts/ccbr_get_consensus_peaks.py @@ -2,7 +2,9 @@ import argparse import uuid import pandas -parser = argparse.ArgumentParser(description=""" + +parser = argparse.ArgumentParser( + description=""" Get consensus peaks from multiple narrowPeak files This is similar to the consensus MAX peaks from https://dx.doi.org/10.5936%2Fcsbj.201401002 @@ -13,84 +15,133 @@ module load bedtools module load ucsc -""",formatter_class=argparse.RawTextHelpFormatter) -parser.add_argument('--peakfiles', required = True, nargs = '+', help = 'space separated list of peakfiles') -parser.add_argument('--outbed', required = True, dest = 'outbed', help = 'consensus output bed file') -parser.add_argument('--filter', required = False, default=0.5, dest = 'filter', help = 'min. fraction of replicates that should be represented in each peak ') -parser.add_argument('--nofilter', action = 'store_true', default = 'store_false',required = False,dest = 'nofilter',help = ' do not filter keep all peaks with score') - - -deleteFiles=[] +""", + formatter_class=argparse.RawTextHelpFormatter, +) +parser.add_argument( + "--peakfiles", required=True, nargs="+", help="space separated list of peakfiles" +) +parser.add_argument( + "--outbed", required=True, dest="outbed", help="consensus output bed file" +) +parser.add_argument( + "--filter", + required=False, + default=0.5, + dest="filter", + help="min. fraction of replicates that should be represented in each peak ", +) +parser.add_argument( + "--nofilter", + action="store_true", + default="store_false", + required=False, + dest="nofilter", + help=" do not filter keep all peaks with score", +) + + +deleteFiles = [] args = parser.parse_args() print(args) -out=open(args.outbed,'w') +out = open(args.outbed, "w") -filter=float(args.filter) +filter = float(args.filter) -rand_name=str(uuid.uuid4()) +rand_name = str(uuid.uuid4()) # concat -cmd="cat" +cmd = "cat" for p in args.peakfiles: - cmd+=" "+p -cmd+=" | cut -f1-3 > "+rand_name+".concat.bed" + cmd += " " + p +cmd += " | cut -f1-3 > " + rand_name + ".concat.bed" -deleteFiles.append(rand_name+".concat.bed") +deleteFiles.append(rand_name + ".concat.bed") print(cmd) os.system(cmd) # sort and merge -cmd="bedSort "+rand_name+".concat.bed "+rand_name+".concat.bed && bedtools merge -i "+rand_name+".concat.bed > "+rand_name+".merged.bed" - -deleteFiles.append(rand_name+".merged.bed") +cmd = ( + "bedSort " + + rand_name + + ".concat.bed " + + rand_name + + ".concat.bed && bedtools merge -i " + + rand_name + + ".concat.bed > " + + rand_name + + ".merged.bed" +) + +deleteFiles.append(rand_name + ".merged.bed") print(cmd) os.system(cmd) # check merged count -npeaks=len(open(rand_name+".merged.bed").readlines()) -if (npeaks==0): - out.close() - exit("Number of merged peaks = 0") +npeaks = len(open(rand_name + ".merged.bed").readlines()) +if npeaks == 0: + out.close() + exit("Number of merged peaks = 0") -count=0 +count = 0 for p in args.peakfiles: - count+=1 - sortedfile=p + ".sorted." + rand_name - countfile=p + ".counts." + rand_name - cmd = "cut -f1-3 " + p + " > " + sortedfile + " && bedSort " + sortedfile + " " + sortedfile - print(cmd) - os.system(cmd) - cmd = "bedmap --delim '\t' --echo-ref-name --count " + rand_name + ".merged.bed " + sortedfile + "|awk -F'\t\' -v OFS='\t' '{if ($2>1){$2=1}{print}}' > " + countfile - print(cmd) - os.system(cmd) - deleteFiles.append(countfile) - deleteFiles.append(sortedfile) - if count==1: - df=pandas.read_csv(countfile,delimiter="\t") - df.columns=["peakid",countfile] - else: - dfx=pandas.read_csv(countfile,delimiter="\t") - dfx.columns=["peakid",countfile] - df=df.merge(dfx,on="peakid") - -df=df.set_index("peakid") -df=df.sum(axis=1)/len(df.columns) -df=pandas.DataFrame({'peakid':df.index,'score':df.values}) + count += 1 + sortedfile = p + ".sorted." + rand_name + countfile = p + ".counts." + rand_name + cmd = ( + "cut -f1-3 " + + p + + " > " + + sortedfile + + " && bedSort " + + sortedfile + + " " + + sortedfile + ) + print(cmd) + os.system(cmd) + cmd = ( + "bedmap --delim '\t' --echo-ref-name --count " + + rand_name + + ".merged.bed " + + sortedfile + + "|awk -F'\t' -v OFS='\t' '{if ($2>1){$2=1}{print}}' > " + + countfile + ) + print(cmd) + os.system(cmd) + deleteFiles.append(countfile) + deleteFiles.append(sortedfile) + if count == 1: + df = pandas.read_csv(countfile, delimiter="\t") + df.columns = ["peakid", countfile] + else: + dfx = pandas.read_csv(countfile, delimiter="\t") + dfx.columns = ["peakid", countfile] + df = df.merge(dfx, on="peakid") + +df = df.set_index("peakid") +df = df.sum(axis=1) / len(df.columns) +df = pandas.DataFrame({"peakid": df.index, "score": df.values}) for index, row in df.iterrows(): - chrom,coords=row["peakid"].split(":") - start,end=coords.split("-") - if args.nofilter==True: - out.write("%s\t%s\t%s\t%s\t%.3f\t.\n"%(chrom,start,end,row["peakid"],float(row["score"]))) - elif float(row["score"])>filter: - out.write("%s\t%s\t%s\t%s\t%.3f\t.\n"%(chrom,start,end,row["peakid"],float(row["score"]))) + chrom, coords = row["peakid"].split(":") + start, end = coords.split("-") + if args.nofilter == True: + out.write( + "%s\t%s\t%s\t%s\t%.3f\t.\n" + % (chrom, start, end, row["peakid"], float(row["score"])) + ) + elif float(row["score"]) > filter: + out.write( + "%s\t%s\t%s\t%s\t%.3f\t.\n" + % (chrom, start, end, row["peakid"], float(row["score"])) + ) out.close() for f in deleteFiles: - os.remove(f) - - + os.remove(f) diff --git a/workflow/scripts/ccbr_jaccard_pca.bash b/workflow/scripts/ccbr_jaccard_pca.bash index c8d4947..8f3a17a 100644 --- a/workflow/scripts/ccbr_jaccard_pca.bash +++ b/workflow/scripts/ccbr_jaccard_pca.bash @@ -22,4 +22,4 @@ then else cp ${SCRIPTSFOLDER}/pairwise.Rmd . Rscript -e "rmarkdown::render(\"pairwise.Rmd\",output_file=\"${PCAHTML}\", params = list(pairwise=\"${PAIRWISE}\",inputfilelist=\"${INPUTFILELIST}\"))" -fi \ No newline at end of file +fi diff --git a/workflow/scripts/ccbr_remove_blacklisted_reads_pe.bash b/workflow/scripts/ccbr_remove_blacklisted_reads_pe.bash index e1469ea..7020457 100644 --- a/workflow/scripts/ccbr_remove_blacklisted_reads_pe.bash +++ b/workflow/scripts/ccbr_remove_blacklisted_reads_pe.bash @@ -42,4 +42,4 @@ pigz -f -p $ncpus ${outfastq2_nogz} rm -f ${samplename}.notAlignedToBlacklist.sam \ ${samplename}.notAlignedToBlacklist.bam \ -${samplename}.notAlignedToBlacklist.qsorted.bam \ No newline at end of file +${samplename}.notAlignedToBlacklist.qsorted.bam diff --git a/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R b/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R index 678d3c9..fc49a2d 100644 --- a/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R +++ b/workflow/scripts/fixed_width_peakSets_to_consensus_peakSet.R @@ -4,31 +4,31 @@ suppressPackageStartupMessages(library("argparse")) # create parser object parser <- ArgumentParser(description="Convert narrowPeak output to fixedwidth peakset as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898) using the script narrowPeak_to_fixed_width_peakSet.R. This involves ...First, all replicate peak sets are combined into a cumulative peak set and trimmed for overlap using the same iterative procedure as narrowPeak_to_fixed_width_peakSet, i.e, keep the most significant (based of score per million or normalized p-value) peak and discards any peak that overlaps directly with the most significant peak. To identify reproducible peaks from this merged peak set, the individual replicate peak sets were overlapped with the merged peak set. Peaks from the merged peak set that were observed in at least 2(minReps) samples with a score per million value >=5(cutoffSPM) were labeled as reproducible and reported in the output narrowPeak file. NOTE: this script requires bedtools to be in the PATH") -# specify our desired options -# by default ArgumentParser will add a help option -parser$add_argument("-i", "--inputNarrowPeaks", - type="character", +# specify our desired options +# by default ArgumentParser will add a help option +parser$add_argument("-i", "--inputNarrowPeaks", + type="character", help="absolute fullpaths to fixed width narrowPeak input files from narrowPeak_to_fixed_width_peakSet.R", required=TRUE) -parser$add_argument("-p", "--inputPrefixes", - type="character", +parser$add_argument("-p", "--inputPrefixes", + type="character", help="prefixes to be added to the peaknames from the inputNarrowPeaks to avoid name clashes", required=TRUE) -parser$add_argument("-o", "--outputNarrowPeak", - type="character", +parser$add_argument("-o", "--outputNarrowPeak", + type="character", help="absolute fullpath to narrowPeak output file", required=TRUE) -parser$add_argument("-t", "--tmpdir", +parser$add_argument("-t", "--tmpdir", type="character", help="tmp dir", required=FALSE, default=NULL) -parser$add_argument("-m", "--minReps", +parser$add_argument("-m", "--minReps", type="integer", help="minimum replicates that need to overlap for any peak to be considered a consensus peak", required=FALSE, default=2) -parser$add_argument("-c", "--cutoffSPM", +parser$add_argument("-c", "--cutoffSPM", type="integer", help="cutoff score per million or normalized p-value. All minimum number of replicate peaks that need to overlap with a consensus peak should also be greater than this cutoff score per million or SPM", required=FALSE, @@ -63,7 +63,7 @@ if (debug==1){ tmpdir=setwd(dirname(out_narrowPeak)) } else { tmpdir=args$tmpdir - } + } setwd(tmpdir) } @@ -219,7 +219,7 @@ for (i in 1:length(narrowPeaks)) { x2[x2$normalizedpvalue2>norm_pvalue_threshold,]->x2 update_peaks=peaksoverlapcount$peakname %in% x2$peakname peaksoverlapcount[update_peaks, "count"] <- 1 + peaksoverlapcount[update_peaks, "count"] - + system(paste("rm -f",tmp)) } diff --git a/workflow/scripts/narrowPeak_normalize_pvalues.R b/workflow/scripts/narrowPeak_normalize_pvalues.R index f681287..338810b 100644 --- a/workflow/scripts/narrowPeak_normalize_pvalues.R +++ b/workflow/scripts/narrowPeak_normalize_pvalues.R @@ -4,15 +4,15 @@ suppressPackageStartupMessages(library("argparse")) # create parser object parser <- ArgumentParser(description="This script can be used to normalize p-values of individual peaks so that replicates and sample with different read-depths and number of peaks can be directly compared with each other as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898). To normalize, the peak scores (-log10(p-value)) for each sample were converted to a score per million by dividing each individual peak score by the sum of all of the peak scores in the given sample divided by 1 million.") -# specify our desired options -# by default ArgumentParser will add an help option +# specify our desired options +# by default ArgumentParser will add an help option -parser$add_argument("-i", "--inputNarrowPeak", - type="character", +parser$add_argument("-i", "--inputNarrowPeak", + type="character", help="narrowPeak input file", required=TRUE) -parser$add_argument("-o", "--outputNarrowPeak", - type="character", +parser$add_argument("-o", "--outputNarrowPeak", + type="character", help="narrowPeak output file", required=FALSE, default=NULL) @@ -39,7 +39,7 @@ setwd(dirname(narrowPeak)) tmpdir=setwd(dirname(narrowPeak)) } else { tmpdir=args$tmpdir - } + } setwd(tmpdir) } bn=basename(narrowPeak) diff --git a/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R b/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R index 5041e8c..4506429 100644 --- a/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R +++ b/workflow/scripts/narrowPeak_to_fixed_width_peakSet.R @@ -4,24 +4,24 @@ suppressPackageStartupMessages(library("argparse")) # create parser object parser <- ArgumentParser(description="Convert narrowPeak output to fixedwidth peakset as per the pan-cancer genome paper (https://doi.org/10.1126/science.aav1898). This involves ... First, the most significant peak is kept and any peak that directly overlaps with that significant peak is removed. Then, this process iterates to the next most significant peak and so on until all peaks have either been kept or removed due to direct overlap with a more significant peak. This prevents the removal of peaks due to daisy chaining or indirect overlap and simultaneously maintains a compendium of fixed-width peaks. The significance of each peak is determined by normalized p-values. To normalize, the peak scores (-log10(p-value)) for each sample were converted to a score per million by dividing each individual peak score by the sum of all of the peak scores in the given sample divided by 1 million. NOTE: this script requires bedtools to be in the PATH") -# specify our desired options -# by default ArgumentParser will add an help option +# specify our desired options +# by default ArgumentParser will add an help option -parser$add_argument("-i", "--inputNarrowPeak", - type="character", +parser$add_argument("-i", "--inputNarrowPeak", + type="character", help="narrowPeak input file absolute full path", required=TRUE) -parser$add_argument("-o", "--outputNarrowPeak", - type="character", +parser$add_argument("-o", "--outputNarrowPeak", + type="character", help="narrowPeak output file absolute full path", required=FALSE, default=NULL) -parser$add_argument("-t", "--tmpdir", +parser$add_argument("-t", "--tmpdir", type="character", help="tmp dir", required=FALSE, default=NULL) -parser$add_argument("-w", "--peakWidth", +parser$add_argument("-w", "--peakWidth", type="integer", help="desired peak width", required=FALSE, @@ -53,7 +53,7 @@ if (debug==1){ tmpdir=setwd(dirname(narrowPeak)) } else { tmpdir=args$tmpdir - } + } setwd(tmpdir) } bn=basename(narrowPeak) diff --git a/workflow/scripts/pairwise.Rmd b/workflow/scripts/pairwise.Rmd index 7aa0012..b3abfd1 100644 --- a/workflow/scripts/pairwise.Rmd +++ b/workflow/scripts/pairwise.Rmd @@ -52,7 +52,7 @@ toplot=as.data.frame(m.pca$x[,1:3]) p <- plot_ly(as.data.frame(m.pca$x[,1:3]), x = ~PC1, y = ~PC2, z = ~PC3, hoverinfo="text", color=h, hovertext = rownames(toplot)) %>% add_markers() %>% - layout(title = "Jaccard score 3-d PCA plot", + layout(title = "Jaccard score 3-d PCA plot", scene = list(xaxis = list(title = paste0("PC1 (",m.pc1,"%)")), yaxis = list(title = paste0("PC2 (",m.pc2,"%)")), zaxis = list(title = paste0("PC3 (",m.pc3,"%)")))) @@ -78,4 +78,4 @@ p <- plot_ly(as.data.frame(m.pca$x[,1:2]), x = ~PC1, y = ~PC2, hoverinfo="text" add_markers() %>% layout(title = "Jaccard score 2-d PCA plot", xaxis = x, yaxis = y) p -``` \ No newline at end of file +```