Commit 26fbb1b4 authored by Uladzislava KHAURATOVICH's avatar Uladzislava KHAURATOVICH 💬
Browse files

Upload New File

parent 11e1c01d
---
author: "Uladzislava Khauratovich based on Julian's ans Saren's scripts"
date: <h4> 2 August 2021 </h4>
created: "2021-07-08"
updated: "2021-07-08"
output:
pdf_document:
highlight: default
always_allow_html: true
geometry: paperheight=6.5in,paperwidth=7in,margin=.5in
---
```{r Setup of Knitr & R, include = FALSE}
knitr::opts_chunk$set(echo = T, message = F, eval = T, fig.show = "markup", include = T, warning = F, results = "markup", tidy = T)
## Package Loading ##
Packages = c("dplyr" , "tidyr", "data.table" , "readxl", "ggplot2");
invisible(lapply(Packages, library, character.only = TRUE))
## Empty workspace ##
rm(list=ls())
```
### Julian's part
## Import Data
#It s just one file - we do not need the file name
```{r S Data Import I, message=F, warning=F}
setwd("D:/Heterozygosity_perfect_genome")
data_to_read = list.files(pattern = "\\.vcf_simple$",
recursive = F,
full.names = F)
Sdata1 <- data.table::rbindlist(
sapply(data_to_read, fread, simplify = FALSE),
use.names = FALSE)
```
### Name change
```{r}
Sdata <- Sdata1 %>%
rename(
Chromosome = 1,
FemaleState = 10,
MaleState = 11,
) %>%
mutate_at(
vars(Chromosome),
list(factor)
)
#Sdata all the data we have
```
```{r}
SdataChr13<-subset(Sdata,Chromosome=="Chromosome_13")
```
#### Binning
Adjusted binning
We will start from 0 as min, as the DNA sequence can not have "-" values.
```{r }
binwidth = 1000000
SBindataChr13 <- SdataChr13 %>%
arrange(POS) %>%
mutate(
Bins = cut(POS,
breaks = seq(0, max(SdataChr13$POS) + binwidth, binwidth),
# Now we use include.lowest = T
include.lowest = TRUE,
# We exclude the right part of the bin to have the correct binwidth
right = FALSE,
# dig.lab defines how many digits are to be shown before it switches to scientific notation
dig.lab=10)
) %>%
droplevels()
```
```{r }
# Custom function by Saren
interpret = function(female, male){
state = rep(NA, length(female))
state[female == 1 & male == 1] = "HetMaintained"
state[(female == 1 & male == 0) | (female == 1 & male == 2)] = "LossOfHet"
state[(female == 0 & male == 1) | (female == 2 & male == 1)] = "GainOfHet"
state[(female == 0 & male == 2) | (female == 2 & male == 0)] = "SeqError"
return(state)
}
# datatable if_else chaining
data = SBindataChr13 %>% mutate(state = if_else(FemaleState == 1 & MaleState == 1, "HetMaintained",
if_else(FemaleState == 1 & MaleState == 0, "LossOfHet",
if_else(FemaleState == 1 & MaleState == 2, "LossOfHet",
if_else(FemaleState == 0 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 2 & MaleState == 1, "GainOfHet",
if_else(FemaleState == 0 & MaleState == 2, "SeqError",
if_else(FemaleState == 2 & MaleState == 0, "SeqError",
"NA")))))))
)
#assigned SNPs for Chr1
data1 = data %>% mutate(state = interpret(FemaleState, MaleState))
```
```{r }
data = SBindataChr13 %>% mutate(FHetstate = if_else(FemaleState == 1, "1",
if_else(FemaleState == 0, "0",
if_else(FemaleState == 2, "0",
"NA")))) %>%
mutate(MHetstate = if_else(MaleState == 1, "1",
if_else(MaleState == 0, "0",
if_else(MaleState == 2, "0",
"NA")))) %>%
mutate(allHetstate = if_else(MaleState == 1, "1",
if_else(MaleState == 0, "1",
if_else(MaleState == 2, "1",
"NA"))))
data$FHetstate <- as.numeric(data$FHetstate)
data$MHetstate <- as.numeric(data$MHetstate)
data$allHetstate <- as.numeric(data$allHetstate)
data2 <- data %>%
group_by(Bins) %>%
summarize(
binnedFhetstate = sum(FHetstate),
binnedMhetstate = sum(MHetstate),
binnedallhetstate = sum(allHetstate))
data_new2 <- data2 %>% mutate(fractionM = binnedMhetstate*100/data2$binnedallhetstate)
data_new2 <- data_new2 %>% mutate(fractionF = binnedFhetstate*100/data2$binnedallhetstate)
ggplot(data2, aes(x = Bins, y = binnedMhetstate)) + geom_bar(stat = "identity", color="green", width = 0.5) + xlab("1000000nt bins on Chr1") + ylab("number of heterosites in the raremale") + theme(axis.text.x=element_blank())
ggplot(data_new2, aes(x = Bins, y = fractionF)) + geom_bar(stat = "identity", color="red", width = 0.5) + xlab("1000000nt bins on Chr1") + ylab("fraction of heterosites in the female") + theme(axis.text.x=element_blank())
ggplot(data_new2, aes(x = Bins, y = fractionM)) + geom_bar(stat = "identity", color="lightgreen", width = 0.5) + xlab("1000000nt bins on Chr1") + ylab("fraction of heterosites in the raremale") + theme(axis.text.x=element_blank())
ggplot(data2, aes(x = Bins, y = binnedFhetstate)) + geom_bar(stat = "identity", color="pink", width = 0.5) + xlab("1000000nt bins on Chr1") + ylab("number of heterosites in the female") + theme(axis.text.x=element_blank())
```
```{r, fig.width=9,fig.height=5}
ggplot(data1, aes(x = state, fill = state)) + geom_bar()
ggplot(data1, aes(x = Bins, fill = state)) + geom_bar(position="fill") + xlab("1000000nt bins on Chr13") + ylab("Fraction of SNPs of each category")
ggplot(data1 %>% filter(!is.na(Chromosome)), aes(x = Bins, fill = state)) + geom_bar() + xlab("1000000nt bins on Chr13") + ylab("Number of SNPs of each category") # removed NAs to see absolute numbers
data_new = data1 %>% mutate(LossOfHet = if_else(state == "LossOfHet", "1", "0"))
data_new$LossOfHet <- as.numeric(data_new$LossOfHet)
data_new1 <- data_new %>% select(Bins, LossOfHet) %>% group_by(Bins) %>% summarize(sumLoH = sum(LossOfHet))
ggplot(data_new1, aes(x = Bins, y = sumLoH)) + geom_bar(stat = "identity", color="blue", width = 0.5) + xlab("Bins of 1000000nt") + ylab("Number of SNPs that lost heterozygosity in the raremale")
data_new2 <- data_new1 %>% mutate(fraction = sumLoH*100/data2$binnedFhetstate)
abline <- mean(data_new2$fraction)
#old_abline=19.39713
#new_abline=13.77778
#write.table(data_new2, file="/Users/ukhaurat/Documents/full_assembly_analysis/heterozygosity_bins_million_frac_nq.txt", append = FALSE, quote = F, sep = " ", eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE)
ggplot(data_new2, aes(x = Bins, y = fraction)) + geom_bar(stat = "identity", color="light green", width = 0.3) + xlab("Chr 13 Bins of 1000000nt") + ylab("% SNPs that lost heterozygosity in the raremale") + geom_hline(yintercept = 13.77778) + theme(axis.text.x=element_blank())
```
```{r }
# Custom function by Saren
interpret = function(female, male){
state = rep(NA, length(female))
state[female == 1 & male == 1] = "HetMaintained"
state[(female == 1 & male == 0) | (female == 1 & male == 2)] = "LossOfHet"
state[(female == 0 & male == 1) | (female == 2 & male == 1)] = "GainOfHet"
state[(female == 0 & male == 2) | (female == 2 & male == 0)] = "SeqError"
return(state)
}
# datatable if_else chaining
data = SBindataChr1 %>% mutate(FHetstate = if_else(FemaleState == 1, "1",
if_else(FemaleState == 0, "0",
if_else(FemaleState == 2, "0",
"NA")))) %>%
mutate(MHetstate = if_else(MaleState == 1, "1",
if_else(MaleState == 0, "0",
if_else(MaleState == 2, "0",
"NA"))))
#data1 = data %>% mutate(state = interpret(FemaleState, MaleState))
data$FHetstate <- as.numeric(data$FHetstate)
data$MHetstate <- as.numeric(data$MHetstate)
data2 <- data %>%
group_by(Bins) %>%
summarize(
binnedFhetstate = sum(FHetstate),
binnedMhetstate = sum(MHetstate)
) %>%
mutate(hetdiff = binnedFhetstate-binnedMhetstate) %>%
mutate(log2het = log2(binnedFhetstate/binnedMhetstate))
dataDiff <- data2 %>% select(Bins, hetdiff)
dataLog2 <- data2 %>% select(Bins, log2het)
ggplot(dataDiff, aes(x = Bins, y = hetdiff)) + geom_bar(stat = "identity", color="blue", width = 0.5) + xlab("Bins of 1000000nt") + ylab("difference in number of heterozygous sites F-M")
ggplot(dataLog2, aes(x = Bins, y = log2het)) + geom_bar(stat = "identity", color="blue", width = 0.5) + xlab("Bins of 1000000nt") + ylab("Log2 F/M number of heterozygous sites")
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment