Commit 5372697e authored by Uladzislava KHAURATOVICH's avatar Uladzislava KHAURATOVICH 💬
Browse files

Update Heterozygosity_rare_male.md

parent 54bc6c8e
......@@ -114,7 +114,7 @@ SdataLG6binned <- SdataLG6 %>%
droplevels()
# datatable if_else chaining
#(in the simplified VCF, 0 - homozygous, the same as the reference, 1 - heterozygous, 2 - homozygous, but different from the reference)
#(in the simplified VCF, 0 - homozygous, the same as the reference, 1 - heterozygous, 2 - homozygous, but different from the reference, we count only heterozygous sites)
data = SdataLG6binned %>% mutate(FHetstate = if_else(FemaleState == 1, "1",
if_else(FemaleState == 0, "0",
......@@ -140,7 +140,6 @@ data2 <- data %>%
mutate(log2het = log2(binnedFhetstate/binnedMhetstate))
#Categorization of SNPs
interpret = function(female, male){
state = rep(NA, length(female))
state[female == 1 & male == 1] = "HetMaintained"
......@@ -196,5 +195,117 @@ write.table(data_newZ, file = "data_newZorder.txt", append = FALSE, quote = FALS
```
R script to calculate a "baseline" the average fraction of SNPs that lost heterozygosity in the whole genome (in bins of the same size) except the Z chromosome (=LG6) and except scaffolds that are not assigned to the chromosomes.
The script contains the same steps as for the Z chromosome but for autosomes.
```
knitr::opts_chunk$set(echo = T, message = F, eval = T, fig.show = "markup", include = T, warning = F, results = "markup", tidy = T)
#Package Loading
Packages = c("dplyr" , "tidyr", "data.table" , "readxl", "ggplot2", "plotly", "stringr");
invisible(lapply(Packages, library, character.only = TRUE))
setwd("/Users/ukhaurat/Documents/full_assembly_analysis/")
data_to_read = list.files(pattern = "\\.vcf_simple$",
recursive = F,
full.names = F)
Sdata1 <- data.table::rbindlist(
sapply(data_to_read, fread, simplify = FALSE),
use.names = FALSE)
#Name change
Sdata <- Sdata1 %>%
rename(
Chromosome = 1,
FemaleState = 10,
MaleState = 11,
) %>%
mutate_at(
vars(Chromosome),
list(factor)
)
#subset (remove all the unassigned scaffolds and LG6=Z chromosome)
Sdata4 <- Sdata[!grepl("scaffold",Sdata$Chromosome),]
Sdata_autosomes <- subset(Sdata4, Chromosome != "LG6")
#Binning of the whole genome
binwidth = 1000000
Sdatabinned <- Sdata_autosomes %>%
group_by(Chromosome) %>%
arrange(Chromosome,POS) %>%
mutate(
Bins = cut(POS,
breaks = seq(0, max(Sdata_autosomes$POS) + binwidth, binwidth),
# Now we use include.lowest = T
include.lowest = TRUE,
# We exclude the right part of the bin to have the correct binwidth
right = FALSE,
# dig.lab defines how many digits are to be shown before it switches to scientific notation
dig.lab=10)
) %>%
droplevels()
#Categorization of SNPs
interpret = function(female, male){
state = rep(NA, length(female))
state[female == 1 & male == 1] = "HetMaintained"
state[(female == 1 & male == 0) | (female == 1 & male == 2)] = "LossOfHet"
state[(female == 0 & male == 1) | (female == 2 & male == 1)] = "GainOfHet"
state[(female == 0 & male == 2) | (female == 2 & male == 0)] = "SeqError"
return(state)
}
# datatable if_else chaining
data = Sdatabinned %>% mutate(state = if_else(FemaleState == 1 & MaleState == 1, "HetMaintained",
if_else(FemaleState == 1 & MaleState == 0, "LossOfHet",
if_else(FemaleState == 1 & MaleState == 2, "LossOfHet",
if_else(FemaleState == 0 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 2 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 0 & MaleState == 2, "SeqError",
if_else(FemaleState == 2 & MaleState == 0, "SeqError",
"NA")))))))
)
data1 = data %>% mutate(state = interpret(FemaleState, MaleState))
# datatable if_else chaining to count only heteozygous sites.
#(in the simplified VCF, 0 - homozygous, the same as the reference, 1 - heterozygous, 2 - homozygous, but different from the reference)
data3 = Sdatabinned %>% mutate(FHetstate = if_else(FemaleState == 1, "1",
if_else(FemaleState == 0, "0",
if_else(FemaleState == 2, "0",
"NA")))) %>%
mutate(MHetstate = if_else(MaleState == 1, "1",
if_else(MaleState == 0, "0",
if_else(MaleState == 2, "0",
"NA"))))
data3$FHetstate <- as.numeric(data3$FHetstate)
data3$MHetstate <- as.numeric(data3$MHetstate)
#number of heterozygous sites per bin (and its difference between the female and the male)
data2 <- data3 %>%
group_by(Bins) %>%
summarize(
binnedFhetstate = sum(FHetstate),
binnedMhetstate = sum(MHetstate)
) %>%
mutate(hetdiff = binnedFhetstate-binnedMhetstate) %>%
mutate(log2het = log2(binnedFhetstate/binnedMhetstate))
data_new = data1 %>% mutate(LossOfHet = if_else(state == "LossOfHet", "1", "0"))
data_new$LossOfHet <- as.numeric(data_new$LossOfHet)
data_new1 <- data_new %>% select(Bins, LossOfHet) %>% group_by(Bins) %>% summarize(sumLoH = sum(LossOfHet))
data_new2 <- data_new1 %>% mutate(fraction = sumLoH*100/data2$binnedFhetstate)
abline <- mean(data_new2$fraction)
#new_abline=13.77778
print (abline)
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment