Automation_scripts/desq2.py at main · elichter/Automation_scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# This script will take the mpileup data from all samples in the directory. It
# will concatenate them into a single dataframe, and normalize by sequencing
# depth at each position using the Deseq2 logic.

# First all the imports are done.
import os
import pandas as pd
import numpy as np
from scipy.stats import gmean

# Define path and files.
path = os.getcwd()
files = os.listdir(path)
ts = [f for f in files if f[-8:] == '.bam.tsv']

# Create an empty list to concatenate the data to.
df = []

# Add a column "Name" to each tsv file, and populate it down with the
# basename in the first column (0).
for f in ts:
    data = pd.read_csv(f, sep='\t')
    if 'Name' in data.columns:
        continue
    else:
        a = data.insert(0, "Name", (os.path.basename(f)))

        # The 'Name' will have everything after '_' stripped and only the
        # zeroth string kept. The individual dataframes are saved to excel.
        data['Name'] = data['Name'].str.split('_').str[0]
        header = ['Name','Ref Genome', 'Reference Position', 'Ref Allele', 'Depth',
                'Reads', 'Qual']
        data.columns = header[:len(data.columns)]
        df.append(data)

# Concatenate the list created above into a dataframe.
df = pd.concat(df, axis=0, ignore_index=True)
# Pivot so that the index is the genome position, columns are the names,
# and depth are the values.
pivot = df.pivot(index='Reference Position', columns = 'Name',
        values='Depth')

# Calculate the geomatric mean for the depth at each genome position.
pivot['Pseudo Ref'] = gmean(pivot, axis=1)

# Create a new pivot 'pivot1', calculate the ratio of each sample to the pseudo
# reference. Since this also divides the pseudo reference by itself, copy the
# original pseudo reference from the original pivot in here so that the pseudo
# reference column will contain the actual pseudo reference, not 1s.
pivot1 = pivot.div(pivot['Pseudo Ref'], axis=0)
pivot1['Pseudo Ref'] = pivot['Pseudo Ref']

# Reset the index so that the reference position is visible upon export to
# excel and export to excel.
pivot1.reset_index()
pivot1.to_excel('pivot1.xlsx')

# Calculate the normalization factor for each sample by taking the median of
# the ratio of each sample to the reference genome across the entire reference.
# Put the results into a series called 'Cols'. This series is then converted to
# a dataframe with its index reset, and the headers 'Name' and 'Norm_factor'
# are added to it.
Cols = pivot1.median(axis=0)
Cols = pd.DataFrame(Cols).reset_index()
header = ['Name', 'Norm_factor']
Cols.columns = header[:len(Cols.columns)]

# The dataframe containing the mutations (in this case read in as df) has the
# coverage column mapped to Cols based on the fact that 'Name' matches. The
# coverage is then divided by the normalization factor.
df = pd.read_excel('df2.xlsx')
df['Name'] = df['Name'].str.split('_').str[0]
df['Coverage'] = df['Coverage'] / df['Name'].map(Cols.set_index(['Name'])['Norm_factor'])

# Add count column.
df['count'] = 0
# Group and count.
grouped = df.groupby(['Region', 'Name']).count().reset_index()

# Map the count column to Cols based on Name, and divide by the normalization
# factor to give normalized counts.
grouped['count'] = grouped['count'] / grouped['Name'].map(Cols.set_index(['Name'])['Norm_factor'])
print(grouped)
df.to_excel('deseq2.xlsx',index=False)