Commit 85ffda59 authored by Beatriz Vicoso's avatar Beatriz Vicoso
Browse files

Add new file

parent 3808d7b9
####PICK FACTOR TO UPREGULATE DIPLOID GENES BY (1 = no upregulation, 1.6 = similar to our inferred upregulation)
upfactor<-1
###LOAD DATA
counts<-read.table("~/Documents/Will/Carp/Carp2022/Submission/Resubmission/Counts/3b-Sleuth_noBadSamples/2-Orthogroups/SleuthSexBias/Barb_Brain/KalFiles/Barb_F_Brain_24/abundance.tsv", head=T, sep="\t")
tet2<-read.table("~/Documents/Will/Carp/Carp2022/Submission/Resubmission/Counts/Barb_Carp_1to2_exhaust_correctChrom.txt", sep=" ", head=F)[,2]
dip2<-read.table("~/Documents/Will/Carp/Carp2022/Submission/Resubmission/Counts/Barb_Carp_1to1_exhaust.txt", sep=" ", head=F)[,2]
numtet<-dim(as.data.frame(tet2))[1]
numdip<-dim(as.data.frame(dip2))[1]
numtot<-numtet+numdip
###SANITY CHECK:
###RESTIMATE BARB TPM FROM ORIGINAL COUNTS TABLE
counts$barbtp<- 1000*(counts$est_counts / counts$eff_length)
counts$barbtpm<- ( counts$barbtp / colSums(as.data.frame(counts[,6])) ) * 10^6
### IT WORKS WE RECOVER THE SAME TPM
################################################################
###NOW LET"S MAKE PSEUDOTETRAPLOID COUNTS
###MAKE TABLE OF COUNTS WITH GENES USED IN ANALYSIS, STILL ONLY IN BARB
tet<-merge(counts, as.data.frame(tet2), by.x="target_id", by.y="tet2")
dip<-merge(counts, as.data.frame(dip2), by.x="target_id", by.y="dip2")
head(dip)
head(tet)
allcounts<-rbind(dip, tet)
###RESTIMATE BARB TPM
allcounts$barbtp<- 1000*(allcounts$est_counts / allcounts$eff_length)
allcounts$barbtpm<- ( allcounts$barbtp / colSums(as.data.frame(allcounts[,6])) ) * 10^6
#################################################################
###MAKE TETRAPLOID WITH SOME REDIPLOIDIZATION
### WE ASSUME ALL LOSS WAS FROM SUBB SINCE IT MAKES NO DIFFERENCE TO CALCULATIONS
subA<-allcounts
### Keep only the number of tetraploid genes in subB
# Get a vector of 7147 numbers from 1 to total number of genes
randomselect <- sample(1:numtot, numtet, replace=FALSE)
subB<-allcounts[randomselect,]
### NOW WE SEPARATE DIPLOID AND TETRAPLOID SUBA GENES AND UPREGULATE DIPLOID (OR NOT)
#GENES FOUND IN BOTH SUBGENOMES ARE TETRAPLOID
tetraploidA<-merge(subA, subB[,1:2], by.x="target_id", by.y="target_id")[,-8]
colnames(tetraploidA)<-c("target_id", "length", "eff_length", "est_counts", "tpm", "barbtp", "barbtpm")
#GENES FOUND IN SubA but not tetraploid set are diploid
tetnames<-tetraploidA[,1]
diploidA<-(subA[!(subA[,1] %in% tetnames),])
head(tetraploidA)
head(diploidA)
diploidA$est_counts<-upfactor*diploidA$est_counts
head(diploidA)
subA<-rbind(tetraploidA, diploidA)
## Table with all fake-carp genes so we can get sum of counts for TPM calculations
subAB<-rbind(subA, subB)
sumABcounts<-colSums(as.data.frame(subAB[,6]))
## Reestimate TPM of subA and subB genes, using the sum of counts of both
subA$carptp<- 1000*(subA$est_counts / subA$eff_length)
subA$carptpm<- ( subA$carptp / colSums(as.data.frame(subAB[,6])) ) * 10^6
subB$carptp<- 1000*(subB$est_counts / subB$eff_length)
subB$carptpm<- ( subB$carptp / colSums(as.data.frame(subAB[,6])) ) * 10^6
### MAkE REDIPLOIDIZED AND TETRAPLOID TABLES
#GENES FOUND IN BOTH SUBGENOMES ARE TETRAPLOID
tetraploid<-merge(subA, subB, by.x="target_id", by.y="target_id")
colnames(tetraploid)<-c("target_id", "length.x", "eff_length.x", "est_counts.A", "tpmKallisto", "barbtp.x", "barbtpm.x", "carptp.A", "carptpm.A", "length.y", "eff_length.y", "est_counts.B", "tpm.Kal.y", "barbtp.y", "barbtpm.y", "carptp.B", "carptpm.B")
#GENES FOUND IN SubA but not tetraploid set are diploid
tetnames<-tetraploid[,1]
diploid<-(subA[!(subA[,1] %in% tetnames),])
boxcols<-c( "#A6CEE3", "#B2DF8A", "#1F78B4", "#FF7F00")
boxnames<-c("subA", "subB", "A+B", "1:1")
boxplot(tetraploid$carptpm.A, tetraploid$carptpm.B, tetraploid$carptpm.A+tetraploid$carptpm.B, diploid$carptpm, notch=T, outline=F, names=boxnames, col=boxcols, main="A) No upregulation", ylab="Log2(TPM)")
abline(h=median(tetraploid$carptpm.A+tetraploid$carptpm.B, na.rm=T), lty=2)
ycoord<-(quantile(diploid$carptpm, 0.75, na.rm=T)-quantile(diploid$carptpm, 0.25, na.rm=T))*1.5+quantile(diploid$carptpm, 0.75, na.rm=T)-0.03
#segments(1, ycoord, 4, ycoord, col="darkgrey")
#add stats
ptable<-cbind(wilcox.test(tetraploid$carptpm.A, diploid$carptpm)$p.value, wilcox.test(tetraploid$carptpm.B, diploid$carptpm)$p.value, wilcox.test(tetraploid$carptpm.A+tetraploid$carptpm.B, diploid$carptpm)$p.value)
ptable
ptable[ ptable <0.001 ] <- "***"
ptable[ ptable > 0 & ptable <0.01 ] <- "**"
ptable[ ptable > 0 & ptable <0.05 ] <- "*"
ptable[ ptable > 0 & ptable >0.05 ] <- " "
ptable
text(1,ycoord-0.35, ptable[1], cex=1.3)
text(2,ycoord-0.35, ptable[2], cex=1.3)
text(3,ycoord-0.35, ptable[3], cex=1.3)
median(diploid$carptpm, na.rm=T)/median(tetraploid$carptpm.A+tetraploid$carptpm.B, na.rm=T)
####PICK FACTOR TO UPREGULATE DIPLOID GENES BY (1 = no upregulation, 1.6 = similar to our inferred upregulation)
upfactor<-1.6
###SANITY CHECK:
###RESTIMATE BARB TPM FROM ORIGINAL COUNTS TABLE
counts$barbtp<- 1000*(counts$est_counts / counts$eff_length)
counts$barbtpm<- ( counts$barbtp / colSums(as.data.frame(counts[,6])) ) * 10^6
### IT WORKS WE RECOVER THE SAME TPM
################################################################
###NOW LET"S MAKE PSEUDOTETRAPLOID COUNTS
###MAKE TABLE OF COUNTS WITH GENES USED IN ANALYSIS, STILL ONLY IN BARB
tet<-merge(counts, as.data.frame(tet2), by.x="target_id", by.y="tet2")
dip<-merge(counts, as.data.frame(dip2), by.x="target_id", by.y="dip2")
head(dip)
head(tet)
allcounts<-rbind(dip, tet)
###RESTIMATE BARB TPM
allcounts$barbtp<- 1000*(allcounts$est_counts / allcounts$eff_length)
allcounts$barbtpm<- ( allcounts$barbtp / colSums(as.data.frame(allcounts[,6])) ) * 10^6
#################################################################
###MAKE TETRAPLOID WITH SOME REDIPLOIDIZATION
### WE ASSUME ALL LOSS WAS FROM SUBB SINCE IT MAKES NO DIFFERENCE TO CALCULATIONS
subA<-allcounts
### Keep only the number of tetraploid genes in subB
# Get a vector of 7147 numbers from 1 to total number of genes
randomselect <- sample(1:numtot, numtet, replace=FALSE)
subB<-allcounts[randomselect,]
### NOW WE SEPARATE DIPLOID AND TETRAPLOID SUBA GENES AND UPREGULATE DIPLOID (OR NOT)
#GENES FOUND IN BOTH SUBGENOMES ARE TETRAPLOID
tetraploidA<-merge(subA, subB[,1:2], by.x="target_id", by.y="target_id")[,-8]
colnames(tetraploidA)<-c("target_id", "length", "eff_length", "est_counts", "tpm", "barbtp", "barbtpm")
#GENES FOUND IN SubA but not tetraploid set are diploid
tetnames<-tetraploidA[,1]
diploidA<-(subA[!(subA[,1] %in% tetnames),])
head(tetraploidA)
head(diploidA)
diploidA$est_counts<-upfactor*diploidA$est_counts
head(diploidA)
subA<-rbind(tetraploidA, diploidA)
## Table with all fake-carp genes so we can get sum of counts for TPM calculations
subAB<-rbind(subA, subB)
sumABcounts<-colSums(as.data.frame(subAB[,6]))
## Reestimate TPM of subA and subB genes, using the sum of counts of both
subA$carptp<- 1000*(subA$est_counts / subA$eff_length)
subA$carptpm<- ( subA$carptp / colSums(as.data.frame(subAB[,6])) ) * 10^6
subB$carptp<- 1000*(subB$est_counts / subB$eff_length)
subB$carptpm<- ( subB$carptp / colSums(as.data.frame(subAB[,6])) ) * 10^6
### MAkE REDIPLOIDIZED AND TETRAPLOID TABLES
#GENES FOUND IN BOTH SUBGENOMES ARE TETRAPLOID
tetraploid<-merge(subA, subB, by.x="target_id", by.y="target_id")
colnames(tetraploid)<-c("target_id", "length.x", "eff_length.x", "est_counts.A", "tpmKallisto", "barbtp.x", "barbtpm.x", "carptp.A", "carptpm.A", "length.y", "eff_length.y", "est_counts.B", "tpm.Kal.y", "barbtp.y", "barbtpm.y", "carptp.B", "carptpm.B")
#GENES FOUND IN SubA but not tetraploid set are diploid
tetnames<-tetraploid[,1]
diploid<-(subA[!(subA[,1] %in% tetnames),])
boxcols<-c( "#A6CEE3", "#B2DF8A", "#1F78B4", "#FF7F00")
boxnames<-c("subA", "subB", "A+B", "1:1")
boxplot(tetraploid$carptpm.A, tetraploid$carptpm.B, tetraploid$carptpm.A+tetraploid$carptpm.B, diploid$carptpm, notch=T, outline=F, names=boxnames, col=boxcols, main="B) 1.6x upregulation", ylab="Log2(TPM)")
abline(h=median(tetraploid$carptpm.A+tetraploid$carptpm.B, na.rm=T), lty=2)
ycoord<-(quantile(diploid$carptpm, 0.75, na.rm=T)-quantile(diploid$carptpm, 0.25, na.rm=T))*1.5+quantile(diploid$carptpm, 0.75, na.rm=T)-0.03
#segments(1, ycoord, 4, ycoord, col="darkgrey")
#add stats
ptable<-cbind(wilcox.test(tetraploid$carptpm.A, diploid$carptpm)$p.value, wilcox.test(tetraploid$carptpm.B, diploid$carptpm)$p.value, wilcox.test(tetraploid$carptpm.A+tetraploid$carptpm.B, diploid$carptpm)$p.value)
ptable
ptable[ ptable <0.001 ] <- "***"
ptable[ ptable > 0 & ptable <0.01 ] <- "**"
ptable[ ptable > 0 & ptable <0.05 ] <- "*"
ptable[ ptable > 0 & ptable >0.05 ] <- " "
ptable
text(1,ycoord-0.35, ptable[1], cex=1.3)
text(2,ycoord-0.35, ptable[2], cex=1.3)
text(3,ycoord-0.35, ptable[3], cex=1.3)
median(diploid$carptpm, na.rm=T)/median(tetraploid$carptpm.A+tetraploid$carptpm.B, na.rm=T)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment