Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
IBD-Analysis
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Harald RINGBAUER
IBD-Analysis
Commits
4f8130d5
Commit
4f8130d5
authored
Sep 19, 2016
by
Harald RINGBAUER
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload new file
parent
f7e3a103
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
827 additions
and
0 deletions
+827
-0
IBD-Simulations/multi_runs.py
IBD-Simulations/multi_runs.py
+827
-0
No files found.
IBD-Simulations/multi_runs.py
0 → 100644
View file @
4f8130d5
'''
Created on Mar 19, 2015
Package similar to main, but with the purpose to do multiple runs for statistical analysis and save the data.
Parameters are taken from grid.py and mle_estim_error.py!
@author: Harald Ringbauer
'''
from
grid
import
factory_Grid
from
analysis
import
Analysis
,
torus_distance
from
random
import
shuffle
from
itertools
import
combinations
from
bisect
import
bisect_right
from
math
import
pi
from
scipy.special
import
kv
as
kv
# Import Bessel functions of second kind
import
cPickle
as
pickle
import
numpy
as
np
import
matplotlib.pyplot
as
plt
t
=
200
# Generation time for a single run #t=200
nr_runs
=
20
# How many runs
sample_sizes
=
(
100
,
270
,
440
,
625
)
distances
=
[[
2
,
10
],
[
10
,
20
],
[
20
,
30
],
[
30
,
40
],
[
40
,
50
],
[
50
,
60
]]
# Distances to use for binning
# distances = [[1, 5], [5, 10], [10, 15], [15, 20], [20, 25], [25, 30]] # Distances to use for binning
intervals
=
[[
4
,
5
],
[
5
,
6.5
],
[
6.5
,
8
],
[
8
,
12
]]
# Bins for the block length binning
def
single_run
():
''' Do a single run, parameters are saved in grid'''
grid
=
factory_Grid
()
# Set grid
grid
.
reset_grid
()
# Delete everything
grid
.
set_samples
()
grid
.
update_t
(
t
)
# Do the actual run
data
=
Analysis
(
grid
)
# Do Data-Analysis
data
.
fit_expdecay
(
show
=
False
)
sigma
=
data
.
sigma_estimate
block_nr
=
len
(
grid
.
IBD_blocks
)
return
(
sigma
,
block_nr
)
def
analysis_run
():
grid
=
factory_Grid
()
parameters
=
(
grid
.
sigma
,
grid
.
gridsize
,
grid
.
sample_steps
,
grid
.
dispmode
)
results
=
np
.
zeros
((
nr_runs
,
2
))
# Container for the data
'''Runs the statistical analysis'''
for
i
in
range
(
0
,
nr_runs
):
print
(
"Doing run: %.1f"
%
i
)
results
[
i
,
:]
=
single_run
()
# Do the run and save the results
print
(
"RUN COMPLETE!!"
)
pickle
.
dump
((
results
,
parameters
),
open
(
"Data1/stats_demes.p"
,
"wb"
))
# Pickle the data
# save_name=raw_input("Save to what filename?")
# pickle.dump((results, parameters), open(save_name, "wb")) # Pickle the data
print
(
"SAVED"
)
def
run_var_samp
(
save_name
):
'''Runs simulations for various sample sizes and saves estimates and parameters.'''
grid
=
factory_Grid
()
results
=
np
.
zeros
((
len
(
sample_sizes
),
nr_runs
,
2
))
# Container for the data
sample_steps
=
grid
.
sample_steps
'''Actual runs:'''
row
=
0
for
k
in
sample_sizes
:
position_list
=
[(
i
+
sample_steps
/
2
,
j
+
sample_steps
/
2
,
0
)
for
i
in
range
(
0
,
grid
.
gridsize
,
sample_steps
)
for
j
in
range
(
0
,
grid
.
gridsize
,
sample_steps
)]
# position_list = [(i + sample_steps / 2, j + sample_steps / 2, 0) for i in range(0, sample_steps * k, sample_steps) for j in range(0, sample_steps * k, sample_steps)]
for
i
in
range
(
0
,
nr_runs
):
print
(
"Doing run: %.1f for %.0f samples"
%
(
i
,
k
))
grid
.
reset_grid
()
# Delete everything
shuffle
(
position_list
)
# Randomize position List
print
(
position_list
)
grid
.
set_chromosome
(
position_list
[:
k
])
# Set the samples
grid
.
update_t
(
t
)
# Do the actual run
if
grid
.
dispmode
==
"demes"
:
grid
.
update_IBD_blocks_demes
(
5
)
# Update position for deme analysis!!
data
=
Analysis
(
grid
)
# Do Data-Analysis
data
.
fit_expdecay
(
show
=
False
)
sigma
=
data
.
sigma_estimate
block_nr
=
len
(
grid
.
IBD_blocks
)
results
[
row
,
i
,
:]
=
(
sigma
,
block_nr
)
row
+=
1
# Go one down in the results_row
print
(
"RUN COMPLETE!!"
)
parameters
=
(
grid
.
sigma
,
grid
.
gridsize
,
sample_sizes
,
grid
.
dispmode
)
pickle
.
dump
((
results
,
parameters
),
open
(
save_name
,
"wb"
))
# Pickle the data
print
(
"SAVED"
)
def
empirical_IBD_list
(
save_name
):
'''Generate empirical IBD-list. Nr. of run times'''
results
=
[]
# Container for the data
'''Actual runs:'''
for
i
in
range
(
nr_runs
):
print
(
"Doing run: %i"
%
i
)
grid
=
factory_Grid
(
growing
=
1
)
# No growing grid
grid
.
reset_grid
()
grid
.
set_samples
()
grid
.
update_t
(
t
)
# Do the actual run
# if grid.dispmode == "demes":
# grid.update_IBD_blocks_demes(5) # Update position for deme analysis!!
pair_dist
,
pair_IBD
,
pair_nr
=
grid
.
give_lin_IBD
(
bin_pairs
=
True
)
# Get the binned IBD-lists
results
.
append
([
pair_dist
,
pair_IBD
,
pair_nr
])
# Save the empirical data
parameters
=
(
grid
.
sigma
,
grid
.
gridsize
,
grid
.
sample_steps
,
grid
.
dispmode
)
pickle
.
dump
((
results
,
parameters
),
open
(
save_name
,
"wb"
))
# Pickle the data
print
(
"SAVED"
)
def
get_normalization_factor
(
dist_bins
,
grid_size
,
sample_steps
):
'''Calculates Normalization factor for binned distances and starting grid'''
position_list
=
[(
i
+
sample_steps
/
2
,
j
+
sample_steps
/
2
,
0
)
for
i
in
range
(
0
,
grid_size
,
sample_steps
)
for
j
in
range
(
0
,
grid_size
,
sample_steps
)]
k
=
len
(
position_list
)
print
(
k
*
(
k
-
1
)
/
2
)
distance_bins
=
np
.
zeros
(
len
(
dist_bins
)
-
1
)
# Create bins for every element in List; len(bins)=len(counts)+1
dist_bins
[
-
1
]
+=
0.000001
# Hack to make sure that the distance exactly matching the max are counted
# Calculate Distance for every possible pair to get proper normalization factor:
for
(
x
,
y
)
in
combinations
(
np
.
arange
(
len
(
position_list
)),
r
=
2
):
dist
=
torus_distance
(
position_list
[
x
][
0
],
position_list
[
x
][
1
],
position_list
[
y
][
0
],
position_list
[
y
][
1
],
grid_size
)
j
=
bisect_right
(
dist_bins
,
dist
)
if
j
<
len
(
dist_bins
)
and
j
>
0
:
# So it actually falls into somewhere
distance_bins
[
j
-
1
]
+=
1
return
distance_bins
def
get_normalization_lindata
(
dist_bins
,
pair_dist
,
pair_nr
):
'''Gets the Nr. of pairs in each distance bin from linearized IBD-data'''
bl_nr
=
np
.
zeros
(
len
(
dist_bins
))
# Creates zero array
for
i
in
range
(
len
(
dist_bins
)):
dist
=
dist_bins
[
i
]
bl_nr
[
i
]
+=
np
.
sum
([
pair_nr
[
j
]
for
j
in
range
(
len
(
pair_dist
))
if
dist
[
0
]
<=
pair_dist
[
j
]
<
dist
[
1
]])
return
bl_nr
def
into_bins
(
pair_dist
,
pair_IBD
,
intervals
,
distances
):
'''Return Matrix of blocks Nr in each interval and distance bin.'''
res
=
np
.
zeros
((
len
(
intervals
),
len
(
distances
))).
astype
(
np
.
int
)
# Create empty Int-Array.
for
i
in
range
(
len
(
intervals
)):
for
j
in
range
(
len
(
distances
)):
intv
=
intervals
[
i
]
dist
=
distances
[
j
]
for
d
in
range
(
len
(
pair_dist
)):
# Iterate over every pair
if
dist
[
0
]
<=
pair_dist
[
d
]
<
dist
[
1
]:
res
[
i
,
j
]
+=
np
.
sum
([
1
for
block
in
pair_IBD
[
d
]
if
(
intv
[
0
]
<=
block
<=
intv
[
1
])])
return
np
.
array
(
res
)
# Return numpy array
def
bessel_longer_l0
(
sigma
,
r
,
l0
,
b
,
D
):
'''The formula for block sharing longer than l0. Measured in cM
If r vector return vector. b is growth rate parameter'''
l0
=
l0
/
100.0
# Change to Morgan
G
=
1.5
# Chromosome-Length
C
=
G
/
(
D
*
pi
*
sigma
**
2
)
# First calculate the constant; D_e=1
y
=
C
*
2
**
((
-
5
-
3
*
b
)
/
2.0
)
*
(
r
/
(
np
.
sqrt
(
l0
)
*
sigma
))
**
(
1
+
b
)
*
kv
(
1
+
b
,
np
.
sqrt
(
2.0
*
l0
)
*
r
/
sigma
)
# y = C * r / (sigma * np.sqrt(2 * l0)) * kv(1, np.sqrt(2.0 * l0) * r / sigma)
return
y
def
bessel_l
(
sigma
,
r
,
l
,
b
,
D
):
'''Plots the exact Besseldecay for a block of length l. If r vector return vector'''
l
=
l
/
100.0
# Change to Morgan
G
=
1.5
# Chromosome-Length
C
=
G
/
(
D
*
pi
*
sigma
**
2
)
# First calculate the constant; D_e=1
y
=
C
*
2
**
(
-
3
-
3
*
b
/
2.0
)
*
(
r
/
(
np
.
sqrt
(
l
)
*
sigma
))
**
(
2
+
b
)
*
kv
(
2
+
b
,
np
.
sqrt
(
2.0
*
l
)
*
r
/
sigma
)
# y = C * r / (sigma * np.sqrt(2 * l0)) * kv(1, np.sqrt(2.0 * l0) * r / sigma)
return
(
y
/
100.0
)
# Return density in cM
def
get_theory_sharing
(
intervals
,
distances
,
sigma
,
b
,
D
):
'''Gives back the theory sharing for the given Distance- and
block length intervals'''
res
=
np
.
zeros
((
len
(
intervals
),
len
(
distances
)))
# Array for results
for
i
in
range
(
len
(
intervals
)):
l
=
intervals
[
i
]
int_len
=
l
[
1
]
-
l
[
0
]
for
j
in
range
(
len
(
distances
)):
d1
=
distances
[
j
]
dists
=
np
.
linspace
(
d1
[
0
],
d1
[
1
],
20
)
# 20 Intervals for calculation
shr_bin
=
bessel_longer_l0
(
sigma
,
dists
,
l
[
0
],
b
,
D
)
-
bessel_longer_l0
(
sigma
,
dists
,
l
[
1
],
b
,
D
)
# Get the sharing per length bin
res
[
i
,
j
]
=
np
.
mean
(
shr_bin
)
/
int_len
# Mean sharing per cM averaged over dist bins
return
res
def
analyze_emp_IBD_list
(
save_name
,
show
=
True
,
b
=
0
,
D
=
1
):
'''Plots summary of the empirical-IBD-list'''
(
results
,
parameters
)
=
pickle
.
load
(
open
(
save_name
,
"rb"
))
# Data2/file.p
print
(
parameters
)
print
(
len
(
results
))
sigma
,
_
,
_
,
_
=
parameters
dist_means
=
np
.
array
([
np
.
mean
(
i
)
for
i
in
distances
])
# Mean distances
bl_nr
=
np
.
zeros
((
len
(
results
),
len
(
intervals
),
len
(
distances
))).
astype
(
np
.
int
)
# Container for block matrices
for
i
in
range
(
len
(
results
)):
pair_dist
,
pair_IBD
,
pair_nr
=
results
[
i
]
# Unpack the result arrays.
# bl_list = results[i]
# Generate Block-Info
# block_info = [[b[1], torus_distance(b[2][0], b[2][1], b[3][0], b[3][1], grid_size)] for b in bl_list]
bl_nr
[
i
,
:,
:]
=
into_bins
(
pair_dist
,
pair_IBD
,
intervals
,
distances
)
# Get the empirical data matrix
# print(pair_dist)
# print(pair_nr)
nr_pairs_bins
=
get_normalization_lindata
(
distances
,
pair_dist
,
pair_nr
)
# Get Nr. factor: Pairs of inds
int_len
=
np
.
array
([
i
[
1
]
-
i
[
0
]
for
i
in
intervals
])
# Length of the intervals - to normalize for that
mean_shr
=
np
.
mean
(
bl_nr
,
axis
=
0
)
sts
=
np
.
std
(
bl_nr
,
axis
=
0
)
emp_shr
=
mean_shr
/
nr_pairs_bins
emp_shr
=
emp_shr
/
int_len
[:,
None
]
emp_sts
=
sts
/
nr_pairs_bins
emp_sts
=
emp_sts
/
int_len
[:,
None
]
# print(emp_shr)
thr_shr
=
get_theory_sharing
(
intervals
,
distances
,
sigma
,
b
,
D
)
# Get predicted sharing
# print(thr_shr)
'''Now do the plotting'''
if
show
==
True
:
f
,
axarr
=
plt
.
subplots
(
2
,
2
,
sharex
=
True
,
sharey
=
True
)
# Create sub-plots
for
i
in
range
(
4
):
# Loop through interval list
curr_plot
=
axarr
[
i
/
2
,
i
%
2
]
# Set current plot
curr_plot
.
set_yscale
(
'log'
)
# Set Log-Scale
interval
=
intervals
[
i
]
# int_len = interval[1] - interval[0] # Calculate the length of an interval
curr_plot
.
set_ylim
([
1.0
/
10
**
6
,
1.0
/
10
])
curr_plot
.
set_xlim
([
0
,
70
])
l2
,
=
curr_plot
.
semilogy
(
dist_means
,
thr_shr
[
i
,
:],
'r-.'
,
linewidth
=
2
)
# Plot of exact fit
l1
=
curr_plot
.
errorbar
(
dist_means
,
emp_shr
[
i
,
:],
yerr
=
emp_sts
[
i
,
:],
fmt
=
'bo'
)
# curr_plot.set_ylim([min(y) / 3, max(y) * 3])
curr_plot
.
set_title
(
"Interval: "
+
str
(
interval
)
+
" cM"
)
# curr_plot.annotate("Blocks: %.0f" % self.total_bl_nr, xy=(0.4, 0.7), xycoords='axes fraction', fontsize=16)
f
.
text
(
0.5
,
0.02
,
r'Distance [$\sigma$]'
,
ha
=
'center'
,
va
=
'center'
,
fontsize
=
20
)
f
.
text
(
0.025
,
0.5
,
'IBD-blocks per pair and cM'
,
ha
=
'center'
,
va
=
'center'
,
rotation
=
'vertical'
,
fontsize
=
20
)
f
.
legend
((
l1
,
l2
),
(
'Simulated IBD-sharing'
,
'Theory'
),
loc
=
(
0.7
,
0.36
))
plt
.
tight_layout
()
plt
.
show
()
dist_means
=
dist_means
/
sigma
# Normalize The distance to Dispersal units
return
((
dist_means
,
thr_shr
,
emp_shr
,
emp_sts
))
def
analyze_mult_emp_lists
(
save_names
):
emp_shr
=
[]
emp_sts
=
[]
for
name
in
save_names
:
# Load the data
dist_means
,
thr_shr
,
shr
,
sts
=
analyze_emp_IBD_list
(
name
,
show
=
False
)
emp_shr
.
append
(
shr
)
emp_sts
.
append
(
sts
)
# Do the actual plot: (Multiple Windows)
# f, axarr = plt.subplots(2, 2, sharex=True, sharey=True) # Create sub-plots
# for i in range(4): # Loop through interval list
# curr_plot = axarr[i / 2, i % 2] # Set current plot
# curr_plot.set_yscale('log') # Set Log-Scale
# interval = intervals[i]
# curr_plot.set_ylim([1.0 / 10 ** 6, 1.0 / 10])
# curr_plot.set_xlim([0, 30])
#
# l0, = curr_plot.semilogy(dist_means, thr_shr[i, :], 'r-', linewidth=2) # Plot of exact fit
# l1 = curr_plot.errorbar(dist_means - 1, emp_shr[0][i, :], yerr=emp_sts[0][i, :], fmt='yo', linewidth=2)
# l2 = curr_plot.errorbar(dist_means - 0.5, emp_shr[1][i, :], yerr=emp_sts[1][i, :], fmt='go', linewidth=2)
# l3 = curr_plot.errorbar(dist_means, emp_shr[2][i, :], yerr=emp_sts[2][i, :], fmt='ko', linewidth=2)
# l4 = curr_plot.errorbar(dist_means + 0.5, emp_shr[3][i, :], yerr=emp_sts[3][i, :], fmt='mo', linewidth=2)
# l5 = curr_plot.errorbar(dist_means + 1, emp_shr[4][i, :], yerr=emp_sts[4][i, :], fmt="co", linewidth=2)
# # curr_plot.set_ylim([min(y) / 3, max(y) * 3])
# curr_plot.set_title("Interval: " + str(interval) + " cM", fontsize=20)
# curr_plot.tick_params(axis='x', labelsize=20)
# curr_plot.tick_params(axis='y', labelsize=20)
#
# # curr_plot.annotate("Blocks: %.0f" % self.total_bl_nr, xy=(0.4, 0.7), xycoords='axes fraction', fontsize=16)
# f.text(0.5, 0.02, r'Distance [$\sigma$]', ha='center', va='center', fontsize=28)
# f.text(0.025, 0.5, 'IBD-blocks per pair and cM', ha='center', va='center', rotation='vertical', fontsize=28)
# f.legend((l0, l1, l2, l3, l4, l5), ('Theory', 'DiscSim', 'Laplace', 'Normal', 'Uniform', 'Demes'), loc=(0.83, 0.23))
# plt.tight_layout()
# plt.show()
plt
.
figure
()
plt
.
yscale
(
'log'
)
# Set Log-Scale)
plt
.
ylim
([
1.0
/
10
**
6
,
1.0
/
10
])
plt
.
xlim
([
0
,
30
])
styles
=
[
'r-'
,
'r--'
,
'r-.'
,
'r:'
]
for
i
in
range
(
len
(
intervals
)):
interval
=
intervals
[
i
]
plt
.
semilogy
(
dist_means
,
thr_shr
[
i
,
:],
styles
[
i
],
linewidth
=
3
,
label
=
str
(
interval
)
+
" cM"
)
# Plot of exact fit
l1
=
plt
.
errorbar
(
dist_means
-
1
,
emp_shr
[
0
][
i
,
:],
yerr
=
emp_sts
[
0
][
i
,
:],
fmt
=
'yo'
,
linewidth
=
3
)
l2
=
plt
.
errorbar
(
dist_means
-
0.5
,
emp_shr
[
1
][
i
,
:],
yerr
=
emp_sts
[
1
][
i
,
:],
fmt
=
'go'
,
linewidth
=
3
)
l3
=
plt
.
errorbar
(
dist_means
,
emp_shr
[
2
][
i
,
:],
yerr
=
emp_sts
[
2
][
i
,
:],
fmt
=
'ko'
,
linewidth
=
3
)
l4
=
plt
.
errorbar
(
dist_means
+
0.5
,
emp_shr
[
3
][
i
,
:],
yerr
=
emp_sts
[
3
][
i
,
:],
fmt
=
'mo'
,
linewidth
=
3
)
l5
=
plt
.
errorbar
(
dist_means
+
1
,
emp_shr
[
4
][
i
,
:],
yerr
=
emp_sts
[
4
][
i
,
:],
fmt
=
"co"
,
linewidth
=
3
)
# curr_plot.set_ylim([min(y) / 3, max(y) * 3])
plt
.
tick_params
(
axis
=
'x'
,
labelsize
=
20
)
plt
.
tick_params
(
axis
=
'y'
,
labelsize
=
20
)
# curr_plot.annotate("Blocks: %.0f" % self.total_bl_nr, xy=(0.4, 0.7), xycoords='axes fraction', fontsize=16)
plt
.
xlabel
(
r'Distance [$\sigma$]'
,
fontsize
=
28
)
plt
.
ylabel
(
'IBD-blocks per pair and cM'
,
fontsize
=
28
)
f1
=
plt
.
legend
((
l1
,
l2
,
l3
,
l4
,
l5
),
(
'DiscSim'
,
'Laplace'
,
'Normal'
,
'Uniform'
,
'Demes'
),
loc
=
(
0.75
,
0.67
))
# Create a legend for the first line.
plt
.
gca
().
add_artist
(
f1
)
# Add the legend manually to the current Axes.
plt
.
legend
(
loc
=
(
0.1
,
0.1
))
plt
.
tight_layout
()
plt
.
show
()
def
analyze_var_density
():
'''Analyze varying density data_sets'''
b
=
[
1
,
-
1
,
0
]
# Growth rate parameters
D
=
[
200
,
1
,
10
]
save_names
=
[
"growing20.p"
,
"declining20.p"
,
"const20.p"
]
# distances = [[2, 10], [10, 20], [20, 30], [30, 40], [40, 50], [50, 100]] # Distances to use for binning
# dists = np.linspace(1, 80, 200) # x-Values for Theory
emp_shr
=
[]
emp_sts
=
[]
thr_shrs
=
[]
for
i
in
range
(
len
(
save_names
)):
# Load the data
name
=
save_names
[
i
]
dist_means
,
thr_shr
,
shr
,
sts
=
analyze_emp_IBD_list
(
name
,
show
=
False
,
b
=
b
[
i
],
D
=
D
[
i
])
thr_shrs
.
append
(
thr_shr
)
emp_shr
.
append
(
shr
)
emp_sts
.
append
(
sts
)
# Do the actual plot:
f
,
axarr
=
plt
.
subplots
(
2
,
2
,
sharex
=
True
,
sharey
=
True
)
# Create sub-plots
for
i
in
range
(
4
):
# Loop through interval list
curr_plot
=
axarr
[
i
/
2
,
i
%
2
]
# Set current plot
curr_plot
.
set_yscale
(
'log'
)
# Set Log-Scale
interval
=
intervals
[
i
]
curr_plot
.
set_ylim
([
1.0
/
10
**
7
,
1.0
/
10
])
curr_plot
.
set_xlim
([
0
,
30
])
l0
,
=
curr_plot
.
semilogy
(
dist_means
,
thr_shrs
[
0
][
i
,
:],
'r-'
,
linewidth
=
2
)
# Plot of exact fit
curr_plot
.
errorbar
(
dist_means
,
emp_shr
[
0
][
i
,
:],
yerr
=
emp_sts
[
0
][
i
,
:],
fmt
=
'ro'
)
# l2 = curr_plot.semilogy(dists, bessel_l(1.0, dists, np.sqrt(interval[0]*interval[1]), 1, 200), 'b-')
# l2 = curr_plot.errorbar(dist_means - 0.3, emp_shr[1][i, :], yerr=emp_sts[1][i, :], fmt='go')
l1
,
=
curr_plot
.
semilogy
(
dist_means
,
thr_shrs
[
1
][
i
,
:],
'b-'
,
linewidth
=
2
)
# Plot of exact fit
curr_plot
.
errorbar
(
dist_means
,
emp_shr
[
1
][
i
,
:],
yerr
=
emp_sts
[
1
][
i
,
:],
fmt
=
'bo'
)
l2
,
=
curr_plot
.
semilogy
(
dist_means
,
thr_shrs
[
2
][
i
,
:],
'm-'
,
linewidth
=
2
)
# Plot of exact fit
curr_plot
.
errorbar
(
dist_means
,
emp_shr
[
2
][
i
,
:],
yerr
=
emp_sts
[
2
][
i
,
:],
fmt
=
'mo'
)
curr_plot
.
set_title
(
"Interval: "
+
str
(
interval
)
+
" cM"
,
fontsize
=
20
)
curr_plot
.
tick_params
(
axis
=
'x'
,
labelsize
=
20
)
curr_plot
.
tick_params
(
axis
=
'y'
,
labelsize
=
20
)
# curr_plot.annotate("Blocks: %.0f" % self.total_bl_nr, xy=(0.4, 0.7), xycoords='axes fraction', fontsize=16)
f
.
text
(
0.5
,
0.02
,
r'Distance [$\sigma$]'
,
ha
=
'center'
,
va
=
'center'
,
fontsize
=
28
)
f
.
text
(
0.025
,
0.5
,
'IBD-blocks per pair and cM'
,
ha
=
'center'
,
va
=
'center'
,
rotation
=
'vertical'
,
fontsize
=
28
)
f
.
legend
((
l0
,
l2
,
l1
),
(
'Growing'
,
'Constant'
,
'Declining'
),
loc
=
(
0.8
,
0.35
))
plt
.
tight_layout
()
plt
.
show
()
def
analyze_var_samp
():
'''Analyze saved results in Data2 produced by run_var_samp'''
(
results_l
,
parameters_l
)
=
pickle
.
load
(
open
(
"Data2/laplace1.p"
,
"rb"
))
(
results_n
,
parameters_n
)
=
pickle
.
load
(
open
(
"Data2/normal1.p"
,
"rb"
))
# @UnusedVariable
(
results_u
,
parameters_u
)
=
pickle
.
load
(
open
(
"Data2/uniform1.p"
,
"rb"
))
# @UnusedVariable
(
results_d
,
parameters_d
)
=
pickle
.
load
(
open
(
"Data2/demes3.p"
,
"rb"
))
# @UnusedVariable
(
results_c
,
parameters_c
)
=
pickle
.
load
(
open
(
"Data2/discsim2.p"
,
"rb"
))
# @UnusedVariable
(
results_l1
,
_
)
=
pickle
.
load
(
open
(
"Data2/laplace.p"
,
"rb"
))
(
results_n1
,
_
)
=
pickle
.
load
(
open
(
"Data2/normal.p"
,
"rb"
))
# @UnusedVariable
(
results_u1
,
_
)
=
pickle
.
load
(
open
(
"Data2/uniform.p"
,
"rb"
))
# @UnusedVariable
(
results_d1
,
_
)
=
pickle
.
load
(
open
(
"Data2/demes.p"
,
"rb"
))
# @UnusedVariable
(
results_c1
,
_
)
=
pickle
.
load
(
open
(
"Data2/DISCSIM.p"
,
"rb"
))
# @UnusedVariable
sample_sizes
=
parameters_l
[
2
]
sigma_estimates_l
=
np
.
concatenate
((
results_l
[:,
:,
0
],
results_l1
[:,
:,
0
]),
axis
=
1
)
sigma_estimates_n
=
np
.
concatenate
((
results_n
[:,
:,
0
],
results_n1
[:,
:,
0
]),
axis
=
1
)
sigma_estimates_u
=
np
.
concatenate
((
results_u
[:,
:,
0
],
results_u1
[:,
:,
0
]),
axis
=
1
)
sigma_estimates_d
=
results_d
[:,
:,
0
]
sigma_estimates_c
=
np
.
concatenate
((
results_c
[:,
:,
0
],
results_c1
[:,
:,
0
]),
axis
=
1
)
mean_sigma_est_l
=
np
.
mean
(
sigma_estimates_l
,
1
)
mean_sigma_est_n
=
np
.
mean
(
sigma_estimates_n
,
1
)
mean_sigma_est_u
=
np
.
mean
(
sigma_estimates_u
,
1
)
mean_sigma_est_d
=
np
.
mean
(
sigma_estimates_d
,
1
)
mean_sigma_est_c
=
np
.
mean
(
sigma_estimates_c
,
1
)
print
(
"Sample size: %.1f"
%
np
.
size
(
sigma_estimates_u
,
1
))
std_sigma_est_l
=
np
.
std
(
sigma_estimates_l
,
1
)
std_sigma_est_n
=
np
.
std
(
sigma_estimates_n
,
1
)
std_sigma_est_u
=
np
.
std
(
sigma_estimates_u
,
1
)
std_sigma_est_d
=
np
.
std
(
sigma_estimates_d
,
1
)
std_sigma_est_c
=
np
.
std
(
sigma_estimates_c
,
1
)
plt
.
figure
()
plt
.
errorbar
(
np
.
array
(
sample_sizes
)
-
10
,
mean_sigma_est_d
,
yerr
=
std_sigma_est_d
,
fmt
=
'ko'
,
label
=
"Demes"
,
linewidth
=
2
)
plt
.
errorbar
(
np
.
array
(
sample_sizes
)
-
5
,
mean_sigma_est_u
,
yerr
=
std_sigma_est_u
,
fmt
=
'bo'
,
label
=
"Uniform"
,
linewidth
=
2
)
plt
.
errorbar
(
sample_sizes
,
mean_sigma_est_l
,
yerr
=
std_sigma_est_l
,
fmt
=
'mo'
,
label
=
"Laplace"
,
linewidth
=
2
)
plt
.
errorbar
(
np
.
array
(
sample_sizes
)
+
5
,
mean_sigma_est_n
,
yerr
=
std_sigma_est_n
,
fmt
=
'ro'
,
label
=
"Normal"
,
linewidth
=
2
)
plt
.
errorbar
(
np
.
array
(
sample_sizes
)
+
10
,
mean_sigma_est_c
,
yerr
=
std_sigma_est_c
,
fmt
=
'yo'
,
label
=
"DiscSim"
,
linewidth
=
2
)
plt
.
xlabel
(
"Nr of samples"
,
fontsize
=
20
)
plt
.
ylabel
(
r"$\mathbf{\bar{\sigma}}$"
,
fontsize
=
30
)
plt
.
tick_params
(
axis
=
'x'
,
labelsize
=
15
)
plt
.
tick_params
(
axis
=
'y'
,
labelsize
=
15
)
plt
.
axhline
(
2
,
color
=
'g'
,
label
=
"True Value"
,
linewidth
=
2
)
plt
.
legend
()
plt
.
show
()
def
analyze_stats
():
load_u
=
pickle
.
load
(
open
(
"Data1/stats_uniform.p"
,
"rb"
))[
0
][:,
0
]
load_n
=
pickle
.
load
(
open
(
"Data1/stats_normal.p"
,
"rb"
))[
0
][:,
0
]
load_l
=
pickle
.
load
(
open
(
"Data1/stats_laplace.p"
,
"rb"
))[
0
][:,
0
]
load_d
=
pickle
.
load
(
open
(
"Data1/stats_demes.p"
,
"rb"
))[
0
][:,
0
]
load_c
=
pickle
.
load
(
open
(
"./Data1/disc_cstats.p"
,
"rb"
))[
0
][:,
0
]
/
1000.0
val_u
=
1.987
val_n
=
2.02
val_l
=
2.02
val_d
=
2
val_c
=
2
# plt.figure()
# plt.hist(load_c, bins=20, alpha=0.5)
# plt.axvline(2, color='r', linestyle='dashed', linewidth=3)
# plt.xlabel("Estimation")
# plt.ylabel("Number")
# plt.show()
print
(
load_c
.
mean
())
print
(
"Rel. Bias %.6f"
%
(
load_c
.
mean
()
/
val_c
-
1
))
print
(
"CV %.4f"
%
(
load_c
.
std
()
/
load_c
.
mean
()))
# Plot Stuff
f
,
axarr
=
plt
.
subplots
(
3
,
2
,
sharey
=
True
,
figsize
=
(
5
,
10
))
# axarr.set_title("Estimated Dispersal rate")
axarr
[
0
,
0
].
hist
(
load_u
*
2
/
val_u
,
bins
=
20
,
alpha
=
0.5
)
axarr
[
0
,
0
].
set_title
(
'Uniform'
)
axarr
[
0
,
0
].
axvline
(
2
,
color
=
'r'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
0
,
0
].
axvline
(
np
.
mean
(
load_u
),
color
=
'g'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
0
,
1
].
hist
(
load_n
*
2
/
val_n
,
bins
=
20
,
alpha
=
0.5
)
axarr
[
0
,
1
].
set_title
(
'Normal'
)
axarr
[
0
,
1
].
axvline
(
2
,
color
=
'r'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
0
,
1
].
axvline
(
np
.
mean
(
load_n
),
color
=
'g'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
1
,
0
].
hist
(
load_l
*
2
/
val_l
,
bins
=
20
,
alpha
=
0.5
)
axarr
[
1
,
0
].
set_title
(
'Laplace'
)
axarr
[
1
,
0
].
axvline
(
2
,
color
=
'r'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
1
,
0
].
axvline
(
np
.
mean
(
load_l
),
color
=
'g'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
1
,
1
].
hist
(
load_d
*
2
/
val_d
,
bins
=
20
,
alpha
=
0.5
)
axarr
[
1
,
1
].
set_title
(
'Demes'
)
axarr
[
1
,
1
].
axvline
(
2
,
color
=
'r'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
1
,
1
].
axvline
(
np
.
mean
(
load_d
),
color
=
'g'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
1
,
1
].
set_xlim
(
1.85
,
2.15001
)
axarr
[
2
,
0
].
hist
(
load_c
*
2
/
val_c
,
bins
=
20
,
alpha
=
0.5
)
axarr
[
2
,
0
].
set_title
(
'DISC-SIM'
)
axarr
[
2
,
0
].
axvline
(
2
,
color
=
'r'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
axarr
[
2
,
0
].
axvline
(
np
.
mean
(
load_c
),
color
=
'g'
,
linestyle
=
'dashed'
,
linewidth
=
3
)
# for i in axarr[]:
# i.set_xlim
plt
.
delaxes
(
axarr
[
2
,
1
])
f
.
text
(
0.5
,
0.04
,
'Dispersal Estimates'
,
ha
=
'center'
)
f
.
text
(
0.04
,
0.5
,
'Counts'
,
va
=
'center'
,
rotation
=
'vertical'
)
# Fine-tune figure; hide x ticks for top plots and y ticks for right plots
# plt.setp([a.get_xticklabels() for a in axarr[0, :]], visible=False)
# plt.setp([a.get_yticklabels() for a in axarr[:, 1]], visible=False)
plt
.
show
()
def
run_var_samp1
(
file_name
):
'''Runs MLE-estimates for various sample sizes and saves estimates and CIs.'''
grid
=
factory_Grid
()
# Create an empty Grid.
results
=
np
.
zeros
((
len
(
sample_sizes
),
nr_runs
,
6
))
# Container for the data
parameters
=
(
grid
.
sigma
,
grid
.
gridsize
,
sample_sizes
,
grid
.
dispmode
)
sample_steps
,
grid_size
=
grid
.
sample_steps
,
grid
.
gridsize
'''Actual runs:'''
row
=
0
for
k
in
sample_sizes
:
position_list
=
[(
i
+
sample_steps
/
2
,
j
+
sample_steps
/
2
,
0
)
for
i
in
range
(
0
,
grid_size
,
sample_steps
)
for
j
in
range
(
0
,
grid_size
,
sample_steps
)]
# position_list = [(i + sample_steps / 2, j + sample_steps / 2, 0) for i in range(0, sample_steps * k, sample_steps) for j in range(0, sample_steps * k, sample_steps)]
for
i
in
range
(
0
,
nr_runs
):
print
(
"Doing run: %.1f for %.0f samples"
%
(
i
,
k
))
grid
=
factory_Grid
()
grid
.
reset_grid
()
# Delete everything
shuffle
(
position_list
)
# Randomize position List
grid
.
set_samples
(
position_list
[:
k
])
# Set the samples
grid
.
update_t
(
t
)
# Do the actual run
if
grid
.
dispmode
==
"demes"
:
grid
.
update_IBD_blocks_demes
(
5
)
# Update position for deme analysis!!
# Do the maximum Likelihood estimation
mle_ana
=
grid
.
create_MLE_object
(
bin_pairs
=
True
)
# Create the MLE-object
mle_ana
.
create_mle_model
(
"constant"
,
grid
.
chrom_l
,
[
1
,
2
])
mle_ana
.
mle_analysis_error
()
d_mle
,
sigma_mle
=
mle_ana
.
estimates
[
0
],
mle_ana
.
estimates
[
1
]
ci_s
=
mle_ana
.
ci_s
results
[
row
,
i
,
:]
=
(
ci_s
[
1
][
0
],
ci_s
[
1
][
1
],
ci_s
[
0
][
0
],
ci_s
[
0
][
1
],
sigma_mle
,
d_mle
)
row
+=
1
# Go one down in the results_row
print
(
"RUN COMPLETE!!"
)
pickle
.
dump
((
results
,
parameters
),
open
(
file_name
,
"wb"
))
# Pickle the data
print
(
"SAVED"
)
def
analyze_var_samp1
(
file_name
):
'''Analyze the results of the MLE-estimates for various sample size'''
(
results
,
parameters
)
=
pickle
.
load
(
open
(
file_name
,
"rb"
))
print
(
"Parameters used for Simulations:
\n
"
)
print
(
parameters
)
result
=
3
# Position of result to analyze
ci_lengths
=
results
[
result
,
:,
1
]
-
results
[
result
,
:,
0
]
sigmas_mles
=
results
[
result
,
:,
4
]
d_mles
=
results
[
result
,
:,
5
]
print
(
"Mean CI length: %.4f"
%
np
.
mean
(
ci_lengths
))
print
(
"Mean sigma estimates: %.4f"
%
np
.
mean
(
sigmas_mles
))
print
(
"Standard Deviations sigma: %.4f"
%
np
.
std
(
sigmas_mles
))
print
(
"Mean D_e: %.4f"
%
(
np
.
mean
(
d_mles
)))
print
(
"Standard Deviations D_e: %.4f"
%
np
.
std
(
d_mles
))
k
=
len
(
results
[:,
0
,
0
])
# Calculate Confidence Intervalls:
ci_lengths_s
=
[
results
[
i
,
:,
1
]
-
results
[
i
,
:,
0
]
for
i
in
range
(
k
)]
ci_lengths_d
=
[
results
[
i
,
:,
3
]
-
results
[
i
,
:,
2
]
for
i
in
range
(
k
)]
# Calculate Empirical Confidence Intervals:
ci_lengths_s1
=
[
np
.
percentile
(
results
[
i
,
:,
4
],
97.5
)
-
np
.
percentile
(
results
[
i
,
:,
4
],
2.5
)
for
i
in
range
(
k
)]
ci_lengths_d1
=
[
np
.
percentile
(
results
[
i
,
:,
5
],
97.5
)
-
np
.
percentile
(
results
[
i
,
:,
5
],
2.5
)
for
i
in
range
(
k
)]
print
(
"
\n
Mean Length of est. Confidence Intervals (Sigma/D)"
)
print
(
np
.
mean
(
ci_lengths_s
,
axis
=
1
))
print
(
np
.
mean
(
ci_lengths_d
,
axis
=
1
))
print
(
"
\n
Empirical Confidence Intervals:"
)
print
(
ci_lengths_s1
)
print
(
ci_lengths_d1
)
# Now do the correlation of estimates:
print
(
"Correlation of Estimates"
)
print
([
np
.
corrcoef
(
results
[
i
,
:,
4
],
results
[
i
,
:,
5
])[
0
,
1
]
for
i
in
range
(
k
)])
# Plot Sigma Estimate
plt
.
figure
()
x_dist
=
np
.
linspace
(
0
,
3
,
num
=
len
(
sigmas_mles
))
ist
=
np
.
argsort
(
results
[
0
,
:,
4
])
plt
.
plot
(
0
+
x_dist
,
results
[
0
,
ist
,
4
],
'mo'
,
label
=
"MLE"
)
plt
.
vlines
(
0
+
x_dist
,
results
[
0
,
ist
,
0
],
results
[
0
,
ist
,
1
],
'r'
,
label
=
"Confidence Interval"
)
ist
=
np
.
argsort
(
results
[
1
,
:,
4
])
plt
.
vlines
(
3.5
+
x_dist
,
results
[
1
,
ist
,
0
],
results
[
1
,
ist
,
1
],
'r'
)
plt
.
plot
(
3.5
+
x_dist
,
results
[
1
,
ist
,
4
],
'mo'
)
ist
=
np
.
argsort
(
results
[
2
,
:,
4
])
plt
.
vlines
(
7
+
x_dist
,
results
[
2
,
ist
,
0
],
results
[
2
,
ist
,
1
],
'r'
)
plt
.
plot
(
7
+
x_dist
,
results
[
2
,
ist
,
4
],
'mo'
)
ist
=
np
.
argsort
(
results
[
3
,
:,
4
])
plt
.
vlines
(
10.5
+
x_dist
,
results
[
3
,
ist
,
0
],
results
[
3
,
ist
,
1
],
'r'
)
# plt.scatter(11 + x_dist, results[3, :, 0], c='b')
plt
.
plot
(
10.5
+
x_dist
,
results
[
3
,
ist
,
4
],
'mo'
)
plt
.
xlabel
(
"Sample Size"
,
fontsize
=
20
)
plt
.
ylabel
(
"Estimated "
+
r"$\mathbf{\sigma}$"
,
fontsize
=
20
)
plt
.
xticks
([
1.5
,
5
,
8.5
,
12
],
[
"100"
,
"270"
,
"440"
,
"625"
],
fontsize
=
20
)
plt
.
hlines
(
2
,
-
0.5
,
14
,
label
=
"True "
+
r"$\mathbf{\sigma}$"
,
color
=
'k'
,
linewidth
=
2
)
plt
.
legend
()
plt
.
ylim
([
0
,
3.5
])
plt
.
show
()
# Plot density estimate
plt
.
figure
()
x_dist
=
np
.
linspace
(
0
,
3
,
num
=
len
(
sigmas_mles
))
ist
=
np
.
argsort
(
results
[
0
,
:,
5
])
plt
.
plot
(
0
+
x_dist
,
results
[
0
,
ist
,
5
],
'mo'
,
label
=
"MLE"
)
plt
.
vlines
(
0
+
x_dist
,
results
[
0
,
ist
,
2
],
results
[
0
,
ist
,
3
],
'r'
,
label
=
"Confidence Interval"
)
ist
=
np
.
argsort
(
results
[
1
,
:,
5
])
plt
.
vlines
(
3.5
+
x_dist
,
results
[
1
,
ist
,
2
],
results
[
1
,
ist
,
3
],
'r'
)
plt
.
plot
(
3.5
+
x_dist
,
results
[
1
,
ist
,
5
],
'mo'
)
ist
=
np
.
argsort
(
results
[
2
,
:,
5
])
plt
.
vlines
(
7
+
x_dist
,
results
[
2
,
ist
,
2
],
results
[
2
,
ist
,
3
],
'r'
)
plt
.
plot
(
7
+
x_dist
,
results
[
2
,
ist
,
5
],
'mo'
)
ist
=
np
.
argsort
(
results
[
3
,
:,
5