Commit bea23deb authored by Marwan ELKREWI's avatar Marwan ELKREWI
Browse files

Update Heterozygosity_rare_male.md

parent 81ef4149
......@@ -53,260 +53,3 @@ srun vcftools --gzvcf head_asex_raremale.vcf.gz --remove-indels --maf 0.1 --max-
#filter 2:remove multiallelic
bcftools view --max-alleles 2 --exclude-types indels head_asex_raremale_cov5_filtered.vcf > head_asex_raremale_cov5_filtered2.vcf
```
create a simplified vcf:
```
cat head_asex_raremale_cov5_filtered2.vcf | grep -v '^##' | perl -pi -e 's/:.*?\t/\t/gi' | perl -pi -e 's/:.*//gi' | perl -pi -e 's/\t/ /gi' | perl -pi -e 's/0\/0/0/gi' | perl -pi -e 's/1\/1/2/gi' | perl -pi -e 's/0\/1/1/gi' | perl -pi -e 's/\.\/\./NA/gi' > head_asex_raremale_cov5_filtered2.vcf_simple
```
## 5. Find SNPs with the lost heterozygosity in the raremale genome
R code based on a simplified vcf file. It makes bins of 1 000 000 nucleotides along the Z chromosome. For each bin it calculates the number of SNPs that lost heterozygosity in the rare male genome in comparison with the female genome. The graph illustrates the fraction of such SNPs among all the heterozygous SNPs in the female genome per bin. (The output contains a horizontal line - an average level for the whole genome, it's caluclation is in another R script further)
```
knitr::opts_chunk$set(echo = T, message = F, eval = T, fig.show = "markup", include = T, warning = F, results = "markup", tidy = T)
#Package Loading
Packages = c("dplyr" , "tidyr", "data.table" , "readxl", "ggplot2", "plotly", "stringr");
invisible(lapply(Packages, library, character.only = TRUE))
#Empty workspace
rm(list=ls())
setwd("/Users/ukhaurat/Documents/full_assembly_analysis/")
data_to_read = list.files(pattern = "\\.vcf_simple$",
recursive = F,
full.names = F)
Sdata1 <- data.table::rbindlist(
sapply(data_to_read, fread, simplify = FALSE),
use.names = FALSE)
# Name change
Sdata <- Sdata1 %>%
rename(
Chromosome = 1,
FemaleState = 10,
MaleState = 11,
) %>%
mutate_at(
vars(Chromosome),
list(factor)
)
#subset of LG6=Z chromosome
SdataLG6<-subset(Sdata,Chromosome=="LG6")
#Binning
binwidth = 1000000
SdataLG6binned <- SdataLG6 %>%
mutate(
Bins = cut(POS,
breaks = seq(0, max(SdataLG6$POS) + binwidth, binwidth),
# Now we use include.lowest = T
include.lowest = TRUE,
# We exclude the right part of the bin to have the correct binwidth
right = FALSE,
# dig.lab defines how many digits are to be shown before it switches to scientific notation
dig.lab=10)
) %>%
droplevels()
# datatable if_else chaining
#(in the simplified VCF, 0 - homozygous, the same as the reference, 1 - heterozygous, 2 - homozygous, but different from the reference, we count only heterozygous sites)
data = SdataLG6binned %>% mutate(FHetstate = if_else(FemaleState == 1, "1",
if_else(FemaleState == 0, "0",
if_else(FemaleState == 2, "0",
"NA")))) %>%
mutate(MHetstate = if_else(MaleState == 1, "1",
if_else(MaleState == 0, "0",
if_else(MaleState == 2, "0",
"NA"))))
data$FHetstate <- as.numeric(data$FHetstate)
data$MHetstate <- as.numeric(data$MHetstate)
#calculate the number of heterozygous sites on the female and male Z chromosome, calculate its difference (hetdiff) and F/M ratio of heterosites (log2het)
data2 <- data %>%
group_by(Bins) %>%
summarize(
binnedFhetstate = sum(FHetstate),
binnedMhetstate = sum(MHetstate)
) %>%
mutate(hetdiff = binnedFhetstate-binnedMhetstate) %>%
mutate(log2het = log2(binnedFhetstate/binnedMhetstate))
#Categorization of SNPs
interpret = function(female, male){
state = rep(NA, length(female))
state[female == 1 & male == 1] = "HetMaintained"
state[(female == 1 & male == 0) | (female == 1 & male == 2)] = "LossOfHet"
state[(female == 0 & male == 1) | (female == 2 & male == 1)] = "GainOfHet"
state[(female == 0 & male == 2) | (female == 2 & male == 0)] = "SeqError"
return(state)
}
#Fraction of sites of each category
data3 = SdataLG6binned %>% mutate(state = if_else(FemaleState == 1 & MaleState == 1, "HetMaintained",
if_else(FemaleState == 1 & MaleState == 0, "LossOfHet",
if_else(FemaleState == 1 & MaleState == 2, "LossOfHet",
if_else(FemaleState == 0 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 2 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 0 & MaleState == 2, "SeqError",
if_else(FemaleState == 2 & MaleState == 0, "SeqError",
"NA")))))))
)
data1 = data3 %>% mutate(state = interpret(FemaleState, MaleState))
#graph - fraction of SNPs of each class
ggplot(data1, aes(x = state, fill = state)) + geom_bar()
#graph - fraction of SNPs of each class for each bin along the Z chromosome
ggplot(data1, aes(x = Bins, fill = state)) + geom_bar(position="fill") + xlab("1000000 bin on LG6") + ylab("Fraction of SNPs of each category")
#graph - in absolute numbers, removed NAs (was too high)
ggplot(data1 %>% filter(!is.na(Chromosome)), aes(x = Bins, fill = state)) + geom_bar() + xlab("LG number") + ylab("Number of SNPs of each category")
#preparation to the target graph "%SNPs lost heterozygosity"
data_new = data1 %>% mutate(LossOfHet = if_else(state == "LossOfHet", "1", "0"))
data_new$LossOfHet <- as.numeric(data_new$LossOfHet)
data_new1 <- data_new %>% select(Bins, LossOfHet) %>% group_by(Bins) %>% summarize(sumLoH = sum(LossOfHet))
#just absolute number of SNPs that lost heterozygosity
ggplot(data_new1, aes(x = Bins, y = sumLoH)) + geom_bar(stat = "identity", color="light blue", width = 0.3) + xlab("Bins of 1000000nt") + ylab("Number of SNPs that lost heterozygosity in the rare male")
#actual calculation of the fraction of SNPs that lost heterozygosity in the rare male.
data_newZ <- data_new1 %>% mutate(fraction = sumLoH*100/data2$binnedFhetstate)
#abline is the average fraction of SNPs that lost heterozygosity in the whole genome (in bins of the same size) except the Z chromosome (LG6) and except scaffolds that are not assigned to the chromosomes. It's calculation is in another similar script, which contains the same steps but for the whole genome.
#abline <- mean(data_new2$fraction)
#abline=13.77778
#Target graph, %SNPs lost heterozygosity
ggplot(data_newZ, aes(x = Bins, y = fraction)) + geom_bar(stat = "identity", color="light green", width = 0.3) + xlab("Bins of 1000000nt") + ylim(0, 80) + ylab("% SNPs that lost heterozygosity in the rare male") + geom_hline(yintercept = 13.777) + theme(axis.text.x=element_blank())
#a table with bins along the Z chromosome and the correspondent values for the number of SNPs that lost heterozygosity in the rre male (second column "SumLoH", and the fraction of these SNPs in the bin - the third column "fraction")
write.table(data_newZ, file = "data_newZorder.txt", append = FALSE, quote = FALSE, sep = " ", eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE, qmethod = c("escape", "double"), fileEncoding = "")
```
R script to calculate a "baseline" the average fraction of SNPs that lost heterozygosity in the whole genome (in bins of the same size) except the Z chromosome (=LG6) and except scaffolds that are not assigned to the chromosomes.
The script contains the same steps as for the Z chromosome but for autosomes.
```
knitr::opts_chunk$set(echo = T, message = F, eval = T, fig.show = "markup", include = T, warning = F, results = "markup", tidy = T)
#Package Loading
Packages = c("dplyr" , "tidyr", "data.table" , "readxl", "ggplot2", "plotly", "stringr");
invisible(lapply(Packages, library, character.only = TRUE))
setwd("/Users/ukhaurat/Documents/full_assembly_analysis/")
data_to_read = list.files(pattern = "\\.vcf_simple$",
recursive = F,
full.names = F)
Sdata1 <- data.table::rbindlist(
sapply(data_to_read, fread, simplify = FALSE),
use.names = FALSE)
#Name change
Sdata <- Sdata1 %>%
rename(
Chromosome = 1,
FemaleState = 10,
MaleState = 11,
) %>%
mutate_at(
vars(Chromosome),
list(factor)
)
#subset (remove all the unassigned scaffolds and LG6=Z chromosome)
Sdata4 <- Sdata[!grepl("scaffold",Sdata$Chromosome),]
Sdata_autosomes <- subset(Sdata4, Chromosome != "LG6")
#Binning of the whole genome
binwidth = 1000000
Sdatabinned <- Sdata_autosomes %>%
group_by(Chromosome) %>%
arrange(Chromosome,POS) %>%
mutate(
Bins = cut(POS,
breaks = seq(0, max(Sdata_autosomes$POS) + binwidth, binwidth),
# Now we use include.lowest = T
include.lowest = TRUE,
# We exclude the right part of the bin to have the correct binwidth
right = FALSE,
# dig.lab defines how many digits are to be shown before it switches to scientific notation
dig.lab=10)
) %>%
droplevels()
#Categorization of SNPs
interpret = function(female, male){
state = rep(NA, length(female))
state[female == 1 & male == 1] = "HetMaintained"
state[(female == 1 & male == 0) | (female == 1 & male == 2)] = "LossOfHet"
state[(female == 0 & male == 1) | (female == 2 & male == 1)] = "GainOfHet"
state[(female == 0 & male == 2) | (female == 2 & male == 0)] = "SeqError"
return(state)
}
# datatable if_else chaining
data = Sdatabinned %>% mutate(state = if_else(FemaleState == 1 & MaleState == 1, "HetMaintained",
if_else(FemaleState == 1 & MaleState == 0, "LossOfHet",
if_else(FemaleState == 1 & MaleState == 2, "LossOfHet",
if_else(FemaleState == 0 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 2 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 0 & MaleState == 2, "SeqError",
if_else(FemaleState == 2 & MaleState == 0, "SeqError",
"NA")))))))
)
data1 = data %>% mutate(state = interpret(FemaleState, MaleState))
# datatable if_else chaining to count only heteozygous sites.
#(in the simplified VCF, 0 - homozygous, the same as the reference, 1 - heterozygous, 2 - homozygous, but different from the reference)
data3 = Sdatabinned %>% mutate(FHetstate = if_else(FemaleState == 1, "1",
if_else(FemaleState == 0, "0",
if_else(FemaleState == 2, "0",
"NA")))) %>%
mutate(MHetstate = if_else(MaleState == 1, "1",
if_else(MaleState == 0, "0",
if_else(MaleState == 2, "0",
"NA"))))
data3$FHetstate <- as.numeric(data3$FHetstate)
data3$MHetstate <- as.numeric(data3$MHetstate)
#number of heterozygous sites per bin (and its difference between the female and the male)
data2 <- data3 %>%
group_by(Bins) %>%
summarize(
binnedFhetstate = sum(FHetstate),
binnedMhetstate = sum(MHetstate)
) %>%
mutate(hetdiff = binnedFhetstate-binnedMhetstate) %>%
mutate(log2het = log2(binnedFhetstate/binnedMhetstate))
data_new = data1 %>% mutate(LossOfHet = if_else(state == "LossOfHet", "1", "0"))
data_new$LossOfHet <- as.numeric(data_new$LossOfHet)
data_new1 <- data_new %>% select(Bins, LossOfHet) %>% group_by(Bins) %>% summarize(sumLoH = sum(LossOfHet))
data_new2 <- data_new1 %>% mutate(fraction = sumLoH*100/data2$binnedFhetstate)
abline <- mean(data_new2$fraction)
#new_abline=13.77778
print (abline)
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment