# DeepRegFinder/preprocessing_data.yaml (shenlab-sinai/DeepRegFinder, master)

# Genome used for processing.
genome: 'hg38'

# Optional chromosome-based split of the train, val and test sets.
# To split by chromosome, fill in the three lists below; otherwise leave the
# lists empty.
train_chrom:
  ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9']
val_chrom:
  ['chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17']
test_chrom:
  ['chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']

# Window layout.
window_width: 100
number_of_windows: 20
# The two values above define a 20 x 100 = 2 Kb region for enhancer and
# promoter prediction.

# Number of classes: choose between 2 and 5.
#   2: identifies enhancers.
#   5: identifies Poised Enh, Active Enh, Poised TSS and Active TSS regions
#      in the genome.
num_classes: 2

# genome_size_file: optional TAB-delimited text file that provides chromosome
# names and their sizes. It can be used to select a subset of chromosomes; its
# main purpose is to let you quickly test out the whole pipeline on that
# subset.
# If not specified, the program will use all chromosomes in the
# "valid_chromosomes" argument.
# NOTE(review): the example file name suggests it lists chr1 only, while
# val_chrom/test_chrom name other chromosomes -- confirm that the validation
# and test sets are not left empty with this combination.
genome_size_file: 'example_dat/genome/hg38.chr1.txt'

# valid_chromosomes lists the chromosome names that count as "valid"; it
# serves to remove scaffolds and undetermined chromosomes from training and
# prediction.
valid_chromosomes:
  ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9',
   'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17',
   'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY', 'chrM']

# Folder holding the BAM files for the histone marks. It must be laid out as
# follows:
# histones/
#     histone1/
#         histone1_rep1.bam
#         histone1_rep2.bam
#     histone2/
#         histone2_rep1.bam
#         histone2_rep2.bam
# Note: histone files must be named xxx_repN.bam, where N is the replicate
#       number. Even when you have only one replicate, name it xxx_rep1.bam.
histone_folder: 'example_dat/histones'

# Whether to log-transform the reads that are used for training and for
# predicting enhancers and promoters. Some evidence shows that it's better
# not to use log-transformation.
histone_log_transformation: false

#==== Important: ====#
# If you are preprocessing the prediction dataset only, make sure you use the
# same set of histone marks on which your models were trained. The
# sub-directory names for the histone marks are case sensitive and must be the
# same between training and prediction.

# Performance related parameters.
# Number of CPU cores to use.
cpu_threads: 6

# Set this to true if you already have a trained model and are building data
# for prediction only; the parameters that follow then become irrelevant.
generate_prediction_only: false


#============= The following are required for training only =============#
# The parameters below relate to genomic annotation and model training.
# Leave them alone if you are only building data for prediction.

# Path to the transcriptional start site (TSS) BED file. The BED file shall
# contain regions of 1bp.
tss_file: 'example_dat/genome/gencode_v32_tss.bed'
# Used to slop TSSs.
distal_bp_distance: 1000

# Enhancer data
# Enhancers must be this distance away from a TSS. The same distance is also
# used to select background windows away from DNA regulatory elements.
enhancer_distal_bp_distance: 2000

# H3K4me3 peaks are used as a supplement to the TSSs defined above, so that we
# do not bump into these potential promoters.
H3K4me3_file: 'example_dat/peak_lists/h1-h3k4me3-peaks.bed'

# Path to the DHS (or any other DNA accessibility assay, such as ATAC-seq)
# peak file.
DHS_file: 'example_dat/peak_lists/h1-dnase-seq-peaks.bed'

# Paths to the enhancer peak files. More than one peak list may be given, such
# as both p300 and CBP, to define enhancers.
enhancer_files:
  - 'example_dat/peak_lists/h1-p300-narrowpeaks.bed'

# Transcription factor binding site (TFBS) files.
# These shall be the peak lists for the TFs that you believe to be enhancer
# associated. In H1 stem cells, Nanog, Oct4 and Sox2 are the master regulators
# that are considered to regulate enhancers.
# If there are no TFBS files, just comment out the TFBS lines below.
TFBS:
  - 'example_dat/tfbs/nanog.bed'
  - 'example_dat/tfbs/oct4.bed'

# Model training related parameters.
# NOTE(review): presumably val_p/test_p only apply when the train/val/test
# chromosome lists are left empty -- confirm against the pipeline.
bkg_samples: 30000 # number of background (bkg) samples.
nz_cutoff: 5 # number of nonzero bins in a region; this cutoff is used to discard
             # regions with low information content, mostly empty background regions.
val_p: 0.2 # validation set proportion.
test_p: 0.2 # test set proportion.

# GRO-seq data: use when num_classes is 5; otherwise comment out the lines
# below.
# Paths to the sense and antisense GRO-seq BAM files. When you don't have
# strand-specific GRO-seq, comment out these two lines and use
# "groseq_bam_file" instead.
sense_bam_file: 'example_dat/groseq/h1-groseq-sense.bam'
antisense_bam_file: 'example_dat/groseq/h1-groseq-antisense.bam'
# groseq_bam_file: None  # for non-strand-specific GRO-seq.

# Whether to log-transform the reads to help K-means clustering.
groseq_log_transformation: true

# Whether to delete the intermediate files.
delete_intermediate_files: true