Commit 41261438 authored by Amelie Royer's avatar Amelie Royer

Adding Data parsing scripts

parents
This diff is collapsed.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Generate a synthetic "dummy" POMDP model based on Foodmart.
"""
__author__ = "Amelie Royer"
__email__ = "amelie.royer@ist.ac.at"
import sys, os
import csv
import argparse
import numpy as np
from collections import defaultdict
from random import randint
from utils import *
def init_output_dir(plevel, ulevel, hlength, output=None):
    """
    Initialize the output directory, wiping any previous content.

    Args:
     * ``plevel`` (*int*): level parameter for the product clustering.
     * ``ulevel`` (*int*): level parameter for the customer clustering.
     * ``hlength`` (*int*): history length.
     * ``output`` (*str, optional*): root output directory; defaults to the
       script-level ``args.output`` for backward compatibility.

    Returns:
     * ``output_base`` (*str*): base name for output files.
    """
    import shutil
    if output is None:
        # Legacy behavior: fall back on the global command-line arguments.
        output = args.output
    output_base = "random_u%d_k%d_pl%d" % (ulevel, hlength, plevel)
    output_dir = os.path.join(output, "Random%d%d%d" % (ulevel, hlength, plevel))
    # Start from a clean slate: remove any stale results from a previous run.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    return os.path.join(output_dir, output_base)
def load_data(base_name, plevel, ulevel, hlength, trfr, sv=False):
"""
Load and pre-format the Foodmart data (products, customers and user sessions).
Args:
* ``base_name`` (*str*): path to the main data folder.
* ``plevel`` (*int*): level parameter for the product clustering.
* ``hlength`` (*int*): history length.
* ``sv`` (*bool, optional*): if True, store the computed informations in .items, .profiles, .train and .test
Returns:
* ``product_to_cluster`` (*ndarray*): maps a productID to a clusterID. Note 0 -> -1 is the empty selection.
* ``customer_to_cluster`` (*ndarray*): maps a customerID to a clusterID.
"""
###### Load and Cluster items
#########################################################################
print "\n\033[92m-----> Load and Cluster products\033[0m"
tmp_index = {} # Cluster name -> Cluster ID
# Load product list
if plevel == 0:
with open(os.path.join(base_name, "product.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for product in r:
try:
tmp_index[product[3]]
except KeyError:
tmp_index[product[3]] = len(tmp_index) + 1
else:
# Load product categories
product_classes = {}
with open(os.path.join(base_name, "product_class.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for categories in r:
product_classes[int(categories[0])] = categories[plevel]
# Cluster products
with open(os.path.join(base_name, "product.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for product in r:
try:
tmp_index[product_classes[int(product[0])]]
except KeyError:
tmp_index[product_classes[int(product[0])]] = len(tmp_index) + 1
# Print summary
actions = sorted(tmp_index.values())
n_items = len(actions)
# Init output folder
if sv:
output_base = init_output_dir(plevel, n_items, hlength)
init_base_writing(len(actions), args.history)
###### Load and Cluster users by profile
#########################################################################
customer_to_cluster = np.zeros(line_count(os.path.join(base_name, "customer.csv")), dtype="int") - 1 # Customer ID -> Cluster ID
tmp_index_u = {} # Cluster name -> Cluster ID
with open(os.path.join(base_name, "customer.csv"), 'r') as f:
r = csv.reader(f)
r.next()
for user in r:
customerID = int(user[0])
try:
clusterID = tmp_index_u[assign_customer_cluster(user)]
except KeyError:
clusterID = len(tmp_index_u)
tmp_index_u[assign_customer_cluster(user)] = clusterID
customer_to_cluster[customerID] = clusterID
n_users = len(tmp_index_u)
# Return values
return actions, n_items, get_nstates(n_items, hlength), n_users, output_base
##################################################### M A I N R O U T I N E #######
if __name__ == "__main__":
    ###### 0. Set Parameters
    parser = argparse.ArgumentParser(description='Extract POMDP transition probabilities from the Foodmart dataset.')
    parser.add_argument('-d', '--data', type=str, default="/home/amelie/Rotations/ChatterjeeRotation/Data/Foodmart/data", help="Path to data directory.")
    parser.add_argument('-o', '--output', type=str, default="/home/amelie/Rotations/ChatterjeeRotation/Code/Models", help="Path to output directory.")
    parser.add_argument('-pl', '--plevel', type=int, default=4, help="Clustering level for product categorization (0: no lumping to 4:lumping by family). See product classes hierarchy.")
    parser.add_argument('-ul', '--ulevel', type=int, default=0, help="Clustering level for user categorization.")
    parser.add_argument('-k', '--history', type=int, default=2, help="Length of the history to consider for one state of the MEMDP.")
    parser.add_argument('-t', '--train', type=float, default=0.8, help="Fraction of training data to extract from the database.")
    parser.add_argument('--ordered', action='store_true', help="If present, the states of the MEMDP are ordered product sequences. TODO.")
    parser.add_argument('--norm', action='store_true', help="If present, normalize the output transition probabilities.")
    parser.add_argument('--draw', action='store_true', help="If present, draw the first user MDP model.")
    args = parser.parse_args()

    ###### 1. Check assertions
    assert(args.train >= 0 and args.train <= 1), "Training fraction must be between 0 and 1 (included)"
    assert(args.plevel in [0, 1, 2, 3, 4]), "plevel argument must be in [0, 1, 2, 3, 4]"
    assert(args.ulevel == 0), "ulevel must be in 0"
    assert(args.history > 1), "history length must be strictly greater than 1"
    # Tee everything printed to stdout into an in-memory log so it can be
    # dumped into the .summary file at the end.
    logger = Logger(sys.stdout)
    sys.stdout = logger
    limit = 5000   # NOTE(review): unused below — possibly a leftover
    n_test = 2000  # number of synthetic test sessions to generate

    ###### 1. Load data and create product/user profile
    actions, n_items, n_states, n_users, output_base = load_data(args.data, args.plevel, args.ulevel, args.history, args.train, sv=True)
    # NOTE(review): deliberately overrides the value returned by load_data —
    # the dummy model uses one synthetic user profile per item.
    n_users = n_items
    # exc: sampling weight of each profile's "preferred" item when drawing
    # actions/transitions (vs weight 1 for every other item).
    exc = 4 * (n_users -1)

    #### 2. Write dummy files
    # One dummy label per item cluster.
    with open("%s.items" % output_base, "w") as f:
        f.write("\n".join("Item %d" % i for i in xrange(n_items)))
    # One dummy "profile ID \t 1 \t 1" line per user profile.
    with open("%s.profiles" % output_base, "w") as f:
        f.write("\n".join("%d\t1\t1" % i for i in xrange(n_users)))

    ##### Create dummy test sessions
    with open("%s.test" % output_base, 'w') as f:
        for user in xrange(n_test):
            cluster = randint(0, n_users - 1)   # profile of this synthetic user
            lgth = randint(10, 100)             # random session length
            session = [0]                       # sessions start in state 0
            for _ in xrange(lgth):
                # Draw uniformly over exc + n_users - 1 outcomes: the first
                # n_users - 1 outcomes map onto the non-preferred items, the
                # remaining exc outcomes all select the profile's preferred
                # item (cluster + 1) — i.e. it is chosen with weight exc.
                a = randint(0, exc + n_users - 2)
                if a < n_users - 1:
                    # Remap so that the preferred item is skipped here.
                    if a == cluster:
                        a = n_users - 1
                    a += 1
                else:
                    a = cluster + 1
                s2 = get_next_state_id(session[-1], a)
                # Session stores alternating (item, resulting state) pairs.
                session.append(a)
                session.append(s2)
            f.write("%d\t%d\t%s\n" % (user, cluster, ' '.join(str(x) for x in session) ))

    ###### 2. Set rewards
    # Dummy model: constant reward 1 for every (state, item) pair.
    with open("%s.rewards" % output_base, 'w') as f:
        for s1 in xrange(n_states):
            for item in actions:
                f.write("%d\t%d\t%d\t%.5f\n" % (s1, item, get_next_state_id(s1, item), 1))

    ###### 3. Assign random transition probabilities
    print "\n\033[91m-----> Probability inference\033[0m"
    with open("%s.transitions" % output_base, 'w') as f:
        for user_profile in xrange(n_users):
            print >> sys.stderr, "\n > Profile %d / %d: \n" % (user_profile + 1, n_users),
            sys.stderr.flush()
            # For fixed s1
            for s1 in xrange(n_states):
                sys.stderr.write(" state: %d / %d   \r" % (s1 + 1, n_states))
                sys.stderr.flush()
                # For fixed a
                for a in actions:
                    # For every s2, sample T(s1, a, s2): the profile's
                    # preferred item gets count exc, every other item count 1
                    # (un-normalized unless --norm post-processing is applied).
                    for link in actions:
                        count = exc if link == user_profile + 1 else 1
                        s2 = get_next_state_id(s1, link)
                        f.write("%d\t%d\t%d\t%s\n" % (s1, a, s2, count))
                    # Blank line separates (s1, a) groups in the output file.
                    f.write("\n")

    print "\n\n\033[92m-----> End\033[0m"
    print "   All outputs are in %s" % output_base
    # Dump run parameters plus the captured stdout log into the summary file.
    with open("%s.summary" % output_base, 'w') as f:
        f.write("%d States\n%d Actions (Items)\n%d user profiles\n%d history length\n%d product clustering level\n\n%s" % (n_states, n_items, n_users, args.history, args.plevel, logger.to_string()))
    print
# End
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Utilitary functions for data preprocessing
"""
__author__ = "Amelie Royer"
__email__ = "amelie.royer@ist.ac.at"
import mmap
import numpy as np
from StringIO import StringIO
class Logger:
    """
    Tee-like stream wrapper: everything written through it is forwarded to
    the real stdout AND kept in an in-memory buffer, so the full console
    output can be retrieved later (e.g. for a summary file).
    """

    def __init__(self, stdout):
        # Handle on the real stream, plus the in-memory capture buffer.
        self.stdout = stdout
        self.logfile = StringIO()

    def write(self, text):
        # Mirror the text: screen first, then the log buffer.
        self.stdout.write(text)
        self.logfile.write(text)
        self.logfile.flush()

    def close(self):
        # Close both the wrapped stream and the capture buffer.
        self.stdout.close()
        self.logfile.close()

    def to_string(self):
        # Everything captured since construction, as a single string.
        return self.logfile.getvalue()
def line_count(filename):
    """
    Return the number of lines in the given file.

    Args:
     * ``filename`` (*str*): path to file.

    Returns:
     * ``nlines`` (*int*): number of lines in the file (0 for an empty file).
    """
    # Open read-only in binary mode; ACCESS_READ avoids requiring write
    # permission on the file (the original "r+" mode did).
    with open(filename, "rb") as f:
        try:
            buf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        except ValueError:
            # mmap refuses zero-length files; an empty file has 0 lines.
            return 0
        try:
            nlines = 0
            readline = buf.readline  # hoist the bound method out of the loop
            while readline():
                nlines += 1
            return nlines
        finally:
            buf.close()
def get_nstates(n_items, hlength):
    """
    Return the number of states in the MDP.

    The state space is the set of item histories of length 0 to ``hlength``,
    i.e. the geometric series sum_{k=0..hlength} n_items^k.

    Args:
     * ``n_items`` (*int*): number of items/products available to the user.
     * ``hlength`` (*int*): history length to consider.

    Returns:
     * ``n_states`` (*int*): number of states in the corresponding MDPs.
    """
    if n_items == 1:
        # Degenerate single-item case: one state per history length.
        # (The closed form below would divide by zero.)
        return hlength + 1
    # Floor division keeps the result an exact int on both Python 2 and 3;
    # the geometric series sum always divides evenly.
    return (n_items ** (hlength + 1) - 1) // (n_items - 1)
def assign_customer_cluster(user):
    """
    Assign a user profile given customer data from the foodmart dataset.

    Args:
     * ``user`` (*list*): data for one customer extracted from the foodmart dataset.

    Returns:
     * ``cluster`` (*int*): Cluster ID
    """
    # Gender flag: 1 for female, 0 otherwise (column 19).
    gender = int(user[19] == 'F')
    # Age bucket: age in 1997 (column 16 presumably holds a "YYYY-..."
    # birthdate — only the year prefix is used), one bucket per 30 years.
    # Floor division (//) preserves the Python 2 integer-division behavior
    # of the original "/" and keeps the result an int on Python 3 as well.
    age_category = ((1997 - int(user[16].split('-', 1)[0])) // 10) // 3
    # Encode (gender, age bucket) into a single integer ID.
    return gender * 10 + age_category
def print_customer_cluster(cluster):
    """
    Return the string representation for a cluster ID.

    The tens digit encodes gender (1 = female) and the units digit encodes
    the 30-year age bucket, mirroring ``assign_customer_cluster``.

    Args:
     * ``cluster`` (*int*): Cluster ID

    Returns:
     * ``cluster_str`` (*str*): String representation of the cluster ID
    """
    # Floor division (//) preserves the Python 2 integer-division behavior
    # of the original "/" and is still an int on Python 3.
    return "%s in the %d+ years old category" %("Female" if cluster // 10 else "Male", 30 * (cluster % 10))
def get_n_customer_cluster(ulevel):
    """
    Return the number of user profiles for the given clustering parameter.

    Args:
     * ``ulevel`` (*int*): user clustering parameter.

    Returns:
     * ``n_clusters`` (*int*): Number of clusters that will be created.

    Raises:
     * ``SystemExit``: if ``ulevel`` is not a supported clustering level.
    """
    if ulevel == 0:
        return 6
    # Unsupported level: report on stderr and abort. The original used the
    # Python-2-only "print >> sys.stderr" with `sys` never imported in this
    # module (NameError); a local import + write() fixes both issues.
    import sys
    sys.stderr.write("Unknown ulevel = %s option. Exit.\n" % ulevel)
    raise SystemExit
def assign_product_cluster(product, product_classes, plevel):
    """
    Assign a product profile given product data from the foodmart dataset.

    Args:
     * ``product`` (*list*): data for one product extracted from the foodmart dataset.
     * ``product_classes``: product-class rows extracted from product_class.csv, indexed by class ID.
     * ``plevel`` (*int*): level of clustering in the database (0: fine grained to 4: rough)

    Returns:
     * ``cluster`` (*str*): Cluster ID
    """
    # Level 0 keeps products apart: cluster by product name (column 3).
    if plevel == 0:
        return product[3]
    # Levels 1-4 map onto the class hierarchy (subcategory, category,
    # department, family), stored at columns 0-3 of the class row reached
    # through the product's class ID (column 0).
    if 1 <= plevel <= 4:
        return product_classes[int(product[0])][plevel - 1]
# Module-level state shared by init_base_writing / state_indx /
# get_next_state_id / id_to_state; populated by init_base_writing().
pows = []
"""
``pows`` contains the precomputed exponents for the use of base (n_items) in decreasing order, used for state to index encryption.
"""
acpows = []
"""
``acpows`` contains the cumulative sum of the values in ``pows``.
"""
n_states = 0
"""
``n_states`` contains the number of states in the model.
"""
def init_base_writing(n_items, hlength):
    """
    Initialize the module-level globals ``pows`` (precomputed powers of
    ``n_items`` in decreasing order), ``acpows`` (suffix sums of ``pows``)
    and ``n_states`` (number of states in the system).

    Args:
     * ``n_items``: number of items in the task (not counting the empty selection).
     * ``hlength`` (*int*): history length.
    """
    global pows, acpows, n_states
    # pows[k] = n_items ** (hlength - 1 - k): place value of history slot k,
    # most significant (oldest choice) first.
    pows = [n_items ** (hlength - 1 - k) for k in xrange(hlength)]
    # acpows[k] = pows[k] + pows[k + 1] + ... + pows[-1] (suffix sums),
    # built back-to-front from the last entry (which is 1).
    acpows = list(pows)
    for k in xrange(hlength - 2, -1, -1):
        acpows[k] = acpows[k + 1] + pows[k]
    n_states = get_nstates(n_items, hlength)
def state_indx(item_list):
    """
    Return the index for a state in the model.

    Args:
     * ``item_list`` (*int list*): input state; ordered sequence of items where the first item represents the oldest choice.

    Returns:
     * the integer index of the state.
    """
    global pows, n_states
    # Interpret the sequence as digits in base n_items, oldest choice in the
    # most significant position; `pows` holds the precomputed place values.
    # Local renamed from `id` to stop shadowing the builtin of that name.
    state_id = sum(x * p for x, p in zip(item_list, pows))
    assert(state_id < n_states), "out-of-bound state index: %d" % state_id
    return state_id
def get_next_state_id(s, i):
    """
    Given the current state and the next item chosen by the user, return the
    index of the corresponding next state.

    Args:
     * ``s`` (*int*): current state index.
     * ``i`` (*int*): next user choice.
    """
    global pows, acpows
    # Split off the oldest history slot: quotient / remainder of the
    # offset-corrected index by the highest place value.
    shifted, remainder = divmod(s - acpows[0], pows[0])
    if shifted < -1:
        # Short history (leading empty slot): compensate the offset before
        # re-encoding.
        remainder -= pows[0]
    # Shift the surviving history one slot up and append the new choice.
    return remainder * pows[-2] + (i - 1) + acpows[0]
def id_to_state(s):
    """
    Returns the sequence of items corresponding to a state index.

    Args:
     * ``s`` (*int*): input state index

    Returns
     * ``item_list`` (*int list*): ordered sequence of items corresponding to ``s``, where the first item represented the oldest choice.
    """
    # NOTE(review): relies on init_base_writing() having populated the
    # module globals `pows` and `acpows` beforehand.
    global pows, acpows
    real = s
    output = [0] * len(pows)
    i = 0
    # Peel off one base-(n_items) digit per iteration, most significant
    # (oldest choice) first; pows[i] is the place value of slot i and
    # acpows[i] the accumulated offset for shorter histories.
    while real > pows[-2]:
        cd, rst = divmod(real - acpows[i], pows[i])
        if cd < -1:
            # Negative quotient: this slot stays empty (0); undo the offset
            # so the remainder is re-interpreted at the next level.
            rst -= pows[i]
        else:
            output[i] = cd + 1
        real = rst + acpows[i + 1]
        i += 1
    # The final remainder encodes the most recent item directly.
    output[-1] = real
    return output
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment