# This file is part of Lerot.
#
# Lerot is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Lerot is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Lerot. If not, see <http://www.gnu.org/licenses/>.

import argparse
import gzip
import logging
import os.path
import sys

import yaml

from ..query import load_queries
from ..utils import get_class


class GenericExperiment:
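    """Construct and run a learning experiment.

    The experiment configuration is read from a YAML config file, from
    command-line arguments, or from both; command-line arguments override
    the corresponding config file settings.
    """
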
def __init__(self, args_str=None):
# parse arguments
        parser = argparse.ArgumentParser(description="""
            Construct and run a learning experiment. Provide either the name
            of a config file from which the experiment configuration is
            read, or provide all arguments listed under DETAILS. If both
            are provided, the command-line arguments override the
            corresponding config file settings.""",
            prog=self.__class__.__name__)
# option 1: use a config file
file_group = parser.add_argument_group("FILE")
file_group.add_argument("-f", "--file", help="Filename of the config "
"file from which the experiment details"
" should be read.")
# option 2: specify all experiment details as arguments
detail_group = parser.add_argument_group("DETAILS")
detail_group.add_argument("-i", "--training_queries",
help="File from which to load the training queries (svmlight "
"format).")
detail_group.add_argument("-j", "--test_queries",
help="File from which to load the test queries (svmlight format).")
detail_group.add_argument("-c", "--feature_count", type=int,
help="The number of features included in the data.")
detail_group.add_argument("-r", "--num_runs", type=int,
help="Number of runs (how many times to repeat the experiment).")
detail_group.add_argument("-q", "--num_queries", type=int,
help="Number of queries in each run.")
detail_group.add_argument("-u", "--user_model",
help="Class implementing a user model.")
detail_group.add_argument("-v", "--user_model_args",
help="Arguments for initializing the user model.")
# the retrieval system maintains ranking functions, accepts queries and
# generates result lists, and in return receives user clicks to learn
# from
detail_group.add_argument("-s", "--system",
help="Which system to use (e.g., pairwise, listwise).")
detail_group.add_argument("-a", "--system_args", help="Arguments for "
"the system (comparison method, learning "
"algorithm and parameters...).")
detail_group.add_argument("-o", "--output_dir",
help="(Empty) directory for storing output generated by this"
" experiment. Subdirectory for different folds will be generated"
"automatically.")
detail_group.add_argument("--output_dir_overwrite", default="False")
detail_group.add_argument("-p", "--output_prefix",
help="Prefix to be added to output filenames, e.g., the name of "
"the data set, fold, etc. Output files will be stored as "
"OUTPUT_DIR/PREFIX-RUN_ID.txt.gz")
detail_group.add_argument("-e", "--experimenter",
help="Experimenter type.")
# run the parser
if args_str:
args = parser.parse_known_args(args_str.split())[0]
else:
args = parser.parse_known_args()[0]
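
        # Example invocations (a sketch; the launcher script, file names,
        # and class names are illustrative, not shipped with this module):
        #
        #   python run_experiment.py -f config/experiment.yml
        #   python run_experiment.py -i train.txt -j test.txt -c 64 -r 5 \
        #       -q 100 -u environment.CascadeUserModel -v "..." \
        #       -s retrieval_system.ListwiseLearningSystem -a "..." -o outdir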
# determine whether to use config file or detailed args
self.experiment_args = None
self.args_file = args.file
if args.file:
            # safe_load avoids executing arbitrary tags from the config file
            # (yaml.load without an explicit Loader is deprecated)
            with open(args.file) as config_file:
                self.experiment_args = yaml.safe_load(config_file)
# overwrite with command-line options if given
for arg, value in vars(args).items():
if value:
self.experiment_args[arg] = value
else:
self.experiment_args = vars(args)
        # check that all required arguments are present and set (argparse
        # fills omitted options with None, so test values, not just keys)
        required = ("training_queries", "test_queries", "feature_count",
                    "num_runs", "num_queries", "user_model",
                    "user_model_args", "system", "system_args", "output_dir")
        if any(self.experiment_args.get(key) is None for key in required):
            parser.print_help()
            sys.exit("Missing required arguments, please check the program"
                     " arguments or configuration file. %s" %
                     self.experiment_args)
# set default values for optional arguments
if "query_sampling_method" not in self.experiment_args:
self.experiment_args["query_sampling_method"] = "random"
if "output_dir_overwrite" not in self.experiment_args:
self.experiment_args["output_dir_overwrite"] = False
if "experimenter" not in self.experiment_args:
self.experiment_args["experimenter"] = \
"experiment.LearningExperiment"
if "evaluation" not in self.experiment_args:
self.experiment_args["evaluation"] = "evaluation.NdcgEval"
if "processes" not in self.experiment_args:
self.experiment_args["processes"] = 0
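
        # For reference, a config file could look as follows (a minimal
        # sketch; file names, class names, and values are illustrative):
        #
        #   training_queries: data/Fold1/train.txt
        #   test_queries: data/Fold1/test.txt
        #   feature_count: 64
        #   num_runs: 5
        #   num_queries: 100
        #   user_model: environment.CascadeUserModel
        #   user_model_args: <arguments for the user model>
        #   system: retrieval_system.ListwiseLearningSystem
        #   system_args: <arguments for the learning system>
        #   output_dir: outdir
        #   output_prefix: fold1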
# locate or create directory for the current fold
if not os.path.exists(self.experiment_args["output_dir"]):
os.makedirs(self.experiment_args["output_dir"])
        elif not self.experiment_args["output_dir_overwrite"] and \
                os.listdir(self.experiment_args["output_dir"]):
# make sure the output directory is empty
raise Exception(
"Output dir %s is not an empty directory. Please"
" use a different directory, or move contents out of the way."
% self.experiment_args["output_dir"])
        logging.basicConfig(
            format='%(levelname)s %(module)s %(asctime)s: %(message)s',
            level=logging.INFO)
logging.info("Arguments: %s" % self.experiment_args)
        # log the arguments used in this run
        for k, v in sorted(self.experiment_args.items()):
            logging.info("\t%s: %s" % (k, v))
config_bk = os.path.join(self.experiment_args["output_dir"],
"config_bk.yml")
logging.info("Backing up configuration to: %s" % config_bk)
        with open(config_bk, "w") as config_bk_file:
            yaml.dump(self.experiment_args, config_bk_file,
                      default_flow_style=False)
# load training and test queries
training_file = self.experiment_args["training_queries"]
test_file = self.experiment_args["test_queries"]
self.feature_count = self.experiment_args["feature_count"]
logging.info("Loading training data: %s " % training_file)
self.training_queries = load_queries(training_file, self.feature_count)
logging.info("... found %d queries." %
self.training_queries.get_size())
logging.info("Loading test data: %s " % test_file)
self.test_queries = load_queries(test_file, self.feature_count)
logging.info("... found %d queries." % self.test_queries.get_size())
# initialize and run the experiment num_run times
self.num_runs = self.experiment_args["num_runs"]
self.output_dir = self.experiment_args["output_dir"]
self.output_prefix = self.experiment_args["output_prefix"]
self.experimenter = get_class(self.experiment_args["experimenter"])

    def run(self):
if self.experiment_args["processes"] > 1:
from multiprocessing import Pool
pool = Pool(processes=self.experiment_args["processes"])
results = [
pool.apply_async(self._run, (run_count,))
for run_count in range(self.num_runs)
]
pool.close()
pool.join()
for result in results:
logging.info("Ready: {}".format(result.ready()))
logging.info("Successful: {}".format(result.successful()))
return [result.get() for result in results]
else:
# Run the experiment num_runs times and return the list of results
return [self._run(run_id) for run_id in range(self.num_runs)]

    def _run(self, run_id):
logging.info("run %d starts" % run_id)
aux_log_file = os.path.join(self.output_dir, "_%s-%d.txt.gz" %
(self.output_prefix, run_id))
        # open in text mode ("wt") so that plain strings can be written
        # to the gzipped log
        with gzip.open(aux_log_file, "wt") as aux_log_fh:
            # run the experiment and collect its summary
            summarized_experiment = self.run_experiment(aux_log_fh)
        # write the summary to the result log file
        log_file = os.path.join(self.output_dir, "%s-%d.txt.gz" %
                                (self.output_prefix, run_id))
        with gzip.open(log_file, "wt") as log_fh:
            yaml.dump(summarized_experiment, log_fh,
                      default_flow_style=False)
logging.info("run %d done" % run_id)
return summarized_experiment

    def run_experiment(self, aux_log_fh):
# Run an experiment with given parameters
experiment = self.experimenter(
self.training_queries, self.test_queries, self.feature_count,
aux_log_fh, self.experiment_args)
return experiment.run()
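

# Example usage (a minimal sketch; the __main__ guard below is an
# illustration, not part of the original module). Because of the relative
# imports above, this only works when the file is run as a module inside
# its package, e.g. `python -m <package>.experiment ...`:
if __name__ == "__main__":
    experiment = GenericExperiment()
    experiment.run()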