Commit 631d7f37 authored by Amelie Royer

Cleaning up data generation files

parent 29253f18
......@@ -87,7 +87,6 @@ void load_model_parameters(std::string tfile, std::string rfile,
infile.open(tfile, std::ios::in);
assert((".transitions file not found", infile.is_open()));
int n_profile = 0;
double normalization[n_observations][n_actions] = {0};
while (std::getline(infile, line)) {
std::istringstream iss(line);
// Profile change
......@@ -103,25 +102,33 @@ void load_model_parameters(std::string tfile, std::string rfile,
}
}
// Accumulate
v *= profiles_prop.at(n_profile);
if (precision > 1) { v = std::trunc(v * precision); }
link = is_connected(s1, s2);
assert(("Unfeasible transition with >0 probability", link < n_actions));
transition_matrix[s1][a - 1][link] += v;
normalization[s1][a - 1] += v;
transitions_found++;
}
infile.close();
// Normalize transition matrix
double nrm;
for (s1 = 0; s1 < n_observations; s1++) {
for (a = 0; a < n_actions; a++) {
double nrm = normalization[s1][a];
//double nrm = normalization[s1][a];
nrm = std::accumulate(transition_matrix[s1][a],
transition_matrix[s1][a] + n_actions, 0.0);
std::transform(transition_matrix[s1][a],
transition_matrix[s1][a] + n_actions,
transition_matrix[s1][a],
[nrm](const double t){ return t / nrm; }
);
//double test = 0;
//for (size_t s2 = 0; s2 < n_actions; s2++) {
// test += transition_matrix[s1][a][s2];
// assert(("Im seriously crying", transition_matrix[s1][a][s2] >= 0 && transition_matrix[s1][a][s2] <= 1));
//}
//std::cout << "compare " << test << " " << nrm << "\n";
}
}
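As an aside, the std::accumulate / std::transform pattern used above can be exercised in isolation. Below is a minimal self-contained sketch; the 2-D shape and the zero-row guard are illustrative additions, not the project's code:

#include <algorithm>
#include <cassert>
#include <numeric>

const int n_cols = 3;

// Normalize each row of a matrix in place so it sums to 1.
void normalize_rows(double matrix[][n_cols], int n_rows) {
  for (int s = 0; s < n_rows; s++) {
    // Seed with 0.0, not 0: an int seed would truncate the double sum.
    double nrm = std::accumulate(matrix[s], matrix[s] + n_cols, 0.0);
    if (nrm <= 0.0) { continue; }  // leave all-zero rows untouched
    std::transform(matrix[s], matrix[s] + n_cols, matrix[s],
                   [nrm](double t) { return t / nrm; });
  }
}

int main() {
  double m[2][n_cols] = {{1.0, 3.0, 0.0}, {0.0, 0.0, 0.0}};
  normalize_rows(m, 2);
  assert(m[0][0] == 0.25 && m[0][1] == 0.75 && m[0][2] == 0.0);
  return 0;
}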
......
......@@ -35,9 +35,9 @@ static std::default_random_engine generator(time(NULL));
//T(s1, a, s2) = T(s1, a, connected[s1][s2]) if linked else 0
double transition_matrix [n_environments][n_observations][n_actions][n_actions] = {0};
static double transition_matrix [n_environments][n_observations][n_actions][n_actions];
//R(s1, a, s2) = R(s1, connected[s1][s2]) if a == connected[s1][s2] else 0
double rewards [n_observations][n_actions];
static double rewards [n_observations][n_actions];
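For context on the double -> static double change above: at namespace scope, static only changes linkage, not storage, so its effect is to keep these large arrays private to their translation unit (and, if the definitions live in a shared header, to give each including file its own copy rather than a duplicate-symbol link error). A minimal illustration with a hypothetical name:

// Two .cpp files may each contain this definition without colliding at
// link time; without `static`, the linker would see a duplicate symbol.
static double table[16][16];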
/*! \brief Loads the Model parameters from the precomputed data files.
......@@ -61,6 +61,7 @@ void load_model_parameters(std::string tfile, std::string rfile,
// Check summary file
check_summary_file(sfile, true);
// Load rewards
infile.open(rfile, std::ios::in);
assert((".rewards file not found", infile.is_open()));
......@@ -74,11 +75,11 @@ void load_model_parameters(std::string tfile, std::string rfile,
assert(("Missing links while parsing .rewards file",
links_found == n_observations * n_actions));
infile.close();
// Load transitions
infile.open(tfile, std::ios::in);
assert((".transitions file not found", infile.is_open()));
double normalization [n_environments][n_observations][n_actions] = {0};
//double normalization [n_environments][n_observations][n_actions] = {0};
while (std::getline(infile, line)) {
std::istringstream iss(line);
// Change profile
......@@ -96,18 +97,20 @@ void load_model_parameters(std::string tfile, std::string rfile,
link = is_connected(s1, s2);
assert(("Unfeasible transition with >0 probability", link < n_actions));
transition_matrix[profiles_found][s1][a - 1][link] = v;
normalization[profiles_found][s1][a - 1] += v;
//normalization[profiles_found][s1][a - 1] += v;
transitions_found++;
}
assert(("Missing profiles in .transitions file", profiles_found == n_environments));
infile.close();
// Normalize transition matrix
// Normalize transition matrix (in place, to save memory)
double nrm;
for (p = 0; p < n_environments; p++) {
for (s1 = 0; s1 < n_observations; s1++) {
for (a = 0; a < n_actions; a++) {
nrm = normalization[p][s1][a];
nrm = std::accumulate(transition_matrix[p][s1][a],
transition_matrix[p][s1][a] + n_actions, 0.0);
//nrm = normalization[p][s1][a];
std::transform(transition_matrix[p][s1][a],
transition_matrix[p][s1][a] + n_actions,
transition_matrix[p][s1][a],
......@@ -116,7 +119,6 @@ void load_model_parameters(std::string tfile, std::string rfile,
}
}
}
}
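The dropped normalization buffer stored one double per (environment, observation, action) triple. As illustrative arithmetic (these sizes are hypothetical, not the project's): with 6 environments, 10,000 observations and 10 actions, that is 6 x 10,000 x 10 x 8 bytes, roughly 4.8 MB saved, at the cost of recomputing each row sum with one extra std::accumulate pass.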
......@@ -289,7 +291,7 @@ int main(int argc, char* argv[]) {
datafile_base + ".summary",
std::pow(10, precision));
double loading_time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - start).count() / 1000000.;
// Assert correct sizes
assert(("Error in TRANSITION_MATRIX initialization",
sizeof(transition_matrix)/sizeof(****transition_matrix) ==
......
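The assertion above relies on the built-in-array idiom sizeof(arr)/sizeof(****arr): each dereference peels one array dimension, so ****arr is a single double and the ratio is the total element count. A self-contained sketch with illustrative sizes:

#include <cstddef>

int main() {
  double t[2][3][4][5];
  // *t is a 3-D slice, **t a 2-D slice, ***t a row, ****t one double.
  static_assert(sizeof(t) / sizeof(****t) == 2 * 3 * 4 * 5,
                "total element count of the 4-D array");
  return 0;
}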
......@@ -19,7 +19,7 @@ STDLIB="/usr/local/bin/gcc-4.9.0/lib64"
# GLOBAL VARIABLES
PROFILES=6
ALPHA=1.10
STEPS=1200
STEPS=5000
PRECISION=0
# DATA PATH
......
......@@ -15,6 +15,14 @@
#include <math.h>
int ndigits(int n) {
  // Number of digits in the decimal representation of n (n >= 0).
  if (n == 0) { return 1; }
  int i = 0;
  while (n > 0) {
    n = n / 10;
    i++;
  }
  return i;
}
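For illustration, a self-contained check of the helper above (the expected values assume the zero case returns 1, as in the guard above):

#include <cassert>

// Same logic as ndigits above, repeated so this sketch compiles on its own.
int ndigits(int n) {
  if (n == 0) { return 1; }
  int i = 0;
  while (n > 0) { n /= 10; i++; }
  return i;
}

int main() {
  assert(ndigits(0) == 1);
  assert(ndigits(7) == 1);
  assert(ndigits(4096) == 4);
  return 0;
}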
/**
* CHECK_SUMMARY_FILE
......
......@@ -63,6 +63,8 @@ const size_t n_observations = (pow(NITEMS, HIST + 1) - 1) / (NITEMS - 1); /*!< N
const size_t n_states = NPROFILES * n_observations; /*!< Number of states in the MEMDP */
int ndigits(int n);
/*! \brief Asserts that the information contained in the summary file matches the
* parameters given at compilation time.
*
......
......@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
"""
Generate a synthetic "dummy" POMDP model based on Foodmart.
Generate a synthetic POMDP model with high discrepancy between environments.
"""
__author__ = "Amelie Royer"
__email__ = "amelie.royer@ist.ac.at"
......@@ -12,13 +12,12 @@ import sys, os
import csv
import argparse
import numpy as np
from collections import defaultdict
from random import randint
from utils import *
def init_output_dir(plevel, ulevel, hlength):
def init_output_dir(plevel, hlength):
"""
Initialize the output directory.
......@@ -32,97 +31,14 @@ def init_output_dir(plevel, ulevel, hlength):
* ``output_base`` (*str*): base name for output files.
"""
import shutil
output_base = "random_u%d_k%d_pl%d" % (ulevel, hlength, plevel)
output_dir = os.path.join(args.output, "Random%d%d%d" % (ulevel, hlength, plevel))
output_base = "random_u%d_k%d_pl%d" % (plevel, hlength, plevel)
output_dir = os.path.join(args.output, "Random%d%d%d" % (plevel, hlength, plevel))
if os.path.isdir(output_dir):
shutil.rmtree(output_dir)
os.makedirs(output_dir)
return os.path.join(output_dir, output_base)
def load_data(base_name, plevel, ulevel, hlength, trfr, sv=False):
"""
Load and pre-format the Foodmart data (products, customers and user sessions).
Args:
* ``base_name`` (*str*): path to the main data folder.
* ``plevel`` (*int*): level parameter for the product clustering.
* ``hlength`` (*int*): history length.
* ``sv`` (*bool, optional*): if True, store the computed information in .items, .profiles, .train and .test
Returns:
* ``product_to_cluster`` (*ndarray*): maps a productID to a clusterID. Note 0 -> -1 is the empty selection.
* ``customer_to_cluster`` (*ndarray*): maps a customerID to a clusterID.
"""
###### Load and Cluster items
#########################################################################
print "\n\033[92m-----> Load and Cluster products\033[0m"
tmp_index = {} # Cluster name -> Cluster ID
# Load product list
if plevel == 0:
with open(os.path.join(base_name, "product.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for product in r:
try:
tmp_index[product[3]]
except KeyError:
tmp_index[product[3]] = len(tmp_index) + 1
else:
# Load product categories
product_classes = {}
with open(os.path.join(base_name, "product_class.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for categories in r:
product_classes[int(categories[0])] = categories[plevel]
# Cluster products
with open(os.path.join(base_name, "product.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for product in r:
try:
tmp_index[product_classes[int(product[0])]]
except KeyError:
tmp_index[product_classes[int(product[0])]] = len(tmp_index) + 1
# Print summary
actions = sorted(tmp_index.values())
n_items = len(actions)
# Init output folder
if sv:
output_base = init_output_dir(plevel, n_items, hlength)
init_base_writing(len(actions), args.history)
###### Load and Cluster users by profile
#########################################################################
customer_to_cluster = np.zeros(line_count(os.path.join(base_name, "customer.csv")), dtype="int") - 1 # Customer ID -> Cluster ID
tmp_index_u = {} # Cluster name -> Cluster ID
with open(os.path.join(base_name, "customer.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for user in r:
customerID = int(user[0])
try:
clusterID = tmp_index_u[assign_customer_cluster(user)]
except KeyError:
clusterID = len(tmp_index_u)
tmp_index_u[assign_customer_cluster(user)] = clusterID
customer_to_cluster[customerID] = clusterID
n_users = len(tmp_index_u)
# Return values
return actions, n_items, get_nstates(n_items, hlength), n_users, output_base
......@@ -131,36 +47,33 @@ def load_data(base_name, plevel, ulevel, hlength, trfr, sv=False):
if __name__ == "__main__":
###### 0. Set Parameters
parser = argparse.ArgumentParser(description='Extract POMDP transition probabilities from the Foodmart dataset.')
parser.add_argument('-d', '--data', type=str, default="/home/amelie/Rotations/ChatterjeeRotation/Data/Foodmart/data", help="Path to data directory.")
parser.add_argument('-o', '--output', type=str, default="/home/amelie/Rotations/ChatterjeeRotation/Code/Models", help="Path to output directory.")
parser.add_argument('-pl', '--plevel', type=int, default=4, help="Clustering level for product categorization (0: no lumping, to 4: lumping by family). See product classes hierarchy.")
parser.add_argument('-ul', '--ulevel', type=int, default=0, help="Clustering level for user categorization.")
base_folder = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
parser = argparse.ArgumentParser(description='Generate synthetic POMCP parameters with a high discrepancy between environments.')
parser.add_argument('-o', '--output', type=str, default=os.path.join(base_folder, "Code", "Models"), help="Path to output directory.")
parser.add_argument('-n', '--nactions', type=int, default=3, help="Number of items (actions) and clusters.")
parser.add_argument('-k', '--history', type=int, default=2, help="Length of the history to consider for one state of the MEMDP.")
parser.add_argument('-t', '--train', type=float, default=0.8, help="Fraction of training data to extract from the database.")
parser.add_argument('--ordered', action='store_true', help="If present, the states of the MEMDP are ordered product sequences. TODO.")
parser.add_argument('-t', '--test', type=int, default=2000, help="Number of test sessions to generate.")
#parser.add_argument('--ordered', action='store_true', help="If present, the states of the MEMDP are ordered product sequences. TODO.")
parser.add_argument('--norm', action='store_true', help="If present, normalize the output transition probabilities.")
parser.add_argument('--draw', action='store_true', help="If present, draw the first user MDP model.")
args = parser.parse_args()
###### 1. Check assertions
assert(args.train >= 0 and args.train <= 1), "Training fraction must be between 0 and 1 (included)"
assert(args.plevel in [0, 1, 2, 3, 4]), "plevel argument must be in [0, 1, 2, 3, 4]"
assert(args.ulevel == 0), "ulevel must be 0"
###### 0-bis. Check assertions
assert(args.nactions > 0), "nactions argument must be strictly positive"
assert(args.history > 1), "history length must be strictly greater than 1"
assert(args.test > 0), "Number of test sessions must be strictly positive"
logger = Logger(sys.stdout)
sys.stdout = logger
limit = 5000
n_test = 2000
###### 1. Load data and create product/user profile
actions, n_items, n_states, n_users, output_base = load_data(args.data, args.plevel, args.ulevel, args.history, args.train, sv=True)
n_users = n_items
exc = 4 * (n_users -1)
n_items = args.nactions
n_users = args.nactions
actions = range(1, n_items + 1)
init_base_writing(n_items, args.history)
n_states = get_nstates(n_items, args.history)
output_base = init_output_dir(args.nactions, args.history)
exc = 4 * (n_users - 1)  # dominant link weight: normalized, exc / (exc + n_users - 1) = 0.8
#### 2. Write dummy files
with open("%s.items" % output_base, "w") as f:
......@@ -172,7 +85,7 @@ if __name__ == "__main__":
##### Create dummy test sessions
with open("%s.test" % output_base, 'w') as f:
for user in xrange(n_test):
for user in xrange(args.test):
cluster = randint(0, n_users - 1)
lgth = randint(10, 100)
session = [0]
......@@ -215,15 +128,13 @@ if __name__ == "__main__":
for link in actions:
count = exc if link == user_profile + 1 else 1
s2 = get_next_state_id(s1, link)
f.write("%d\t%d\t%d\t%s\n" % (s1, a, s2, count))
f.write("%d\t%d\t%d\t%s\n" % (s1, a, s2, count if not args.norm else count / (exc + n_actions - 1)))
f.write("\n")
print "\n\n\033[92m-----> End\033[0m"
print " All outputs are in %s" % output_base
with open("%s.summary" % output_base, 'w') as f:
f.write("%d States\n%d Actions (Items)\n%d user profiles\n%d history length\n%d product clustering level\n\n%s" % (n_states, n_items, n_users, args.history, args.plevel, logger.to_string()))
f.write("%d States\n%d Actions (Items)\n%d user profiles\n%d history length\n%d product clustering level\n\n%s" % (n_states, n_items, n_users, args.history, args.nactions, logger.to_string()))
print
# End
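As a worked check of the counts written above (with the default nactions = 3, and assuming n_actions here equals n_items): n_users = 3, so exc = 4 * (3 - 1) = 8, and from every state the preferred link is written with weight 8 while the 2 other links get weight 1. With --norm, the preferred transition becomes 8 / (8 + 3 - 1) = 0.8 and each other link 1 / 10 = 0.1, which is the 0.8 targeted by the comment on exc.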
......@@ -34,22 +34,18 @@ class Logger:
return self.logfile.getvalue()
def line_count(filename):
def line_count(f):
"""
Returns the number of lines in the given file f.
Args:
* ``filename`` (*str*): path to file.
* ``f`` (*File*): file.
Returns:
* ``nlines`` (*int*): number of lines in f.
"""
with open(filename, "r+") as f:
buf = mmap.mmap(f.fileno(), 0)
nlines = 0
readline = buf.readline
while readline():
nlines += 1
nlines = len(f.readlines())
f.close()
return nlines
......@@ -110,7 +106,7 @@ def get_n_customer_cluster(ulevel):
if ulevel == 0:
return 6
else:
print >> sys.stderr, "Unknown ulevel = %s option. Exit." %ulevel
print >> sys.stderr, "Unknown ulevel = %d option. Exit." % ulevel
raise SystemExit
......