Source code for lerot.experiment.GenericExperiment

# This file is part of Lerot.
#
# Lerot is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Lerot is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Lerot.  If not, see <http://www.gnu.org/licenses/>.

import logging
import argparse
import gzip
import os.path
import sys
import yaml

from ..query import load_queries
from ..utils import get_class


class GenericExperiment:

    def __init__(self, args_str=None):
        # parse arguments
        parser = argparse.ArgumentParser(description="""
            Construct and run a learning experiment. Provide either the name
            of a config file from which the experiment configuration is read,
            or provide all arguments listed under Command line. If both are
            provided the config file is ignored.""",
            prog=self.__class__.__name__)

        # option 1: use a config file
        file_group = parser.add_argument_group("FILE")
        file_group.add_argument("-f", "--file", help="Filename of the config "
            "file from which the experiment details should be read.")

        # option 2: specify all experiment details as arguments
        detail_group = parser.add_argument_group("DETAILS")
        detail_group.add_argument("-i", "--training_queries",
            help="File from which to load the training queries (svmlight "
            "format).")
        detail_group.add_argument("-j", "--test_queries",
            help="File from which to load the test queries (svmlight "
            "format).")
        detail_group.add_argument("-c", "--feature_count", type=int,
            help="The number of features included in the data.")
        detail_group.add_argument("-r", "--num_runs", type=int,
            help="Number of runs (how many times to repeat the experiment).")
        detail_group.add_argument("-q", "--num_queries", type=int,
            help="Number of queries in each run.")
        detail_group.add_argument("-u", "--user_model",
            help="Class implementing a user model.")
        detail_group.add_argument("-v", "--user_model_args",
            help="Arguments for initializing the user model.")
        # the retrieval system maintains ranking functions, accepts queries
        # and generates result lists, and in return receives user clicks to
        # learn from
        detail_group.add_argument("-s", "--system",
            help="Which system to use (e.g., pairwise, listwise).")
        detail_group.add_argument("-a", "--system_args",
            help="Arguments for the system (comparison method, learning "
            "algorithm and parameters...).")
        detail_group.add_argument("-o", "--output_dir",
            help="(Empty) directory for storing output generated by this "
            "experiment. Subdirectories for different folds will be "
            "generated automatically.")
        detail_group.add_argument("--output_dir_overwrite", default="False")
        detail_group.add_argument("-p", "--output_prefix",
            help="Prefix to be added to output filenames, e.g., the name of "
            "the data set, fold, etc. Output files will be stored as "
            "OUTPUT_DIR/PREFIX-RUN_ID.txt.gz")
        detail_group.add_argument("-e", "--experimenter",
            help="Experimenter type.")

        # run the parser
        if args_str:
            args = parser.parse_known_args(args_str.split())[0]
        else:
            args = parser.parse_known_args()[0]

        # determine whether to use a config file or the detailed arguments
        self.experiment_args = None
        self.args_file = args.file
        if args.file:
            config_file = open(args.file)
            self.experiment_args = yaml.load(config_file)
            config_file.close()
            # overwrite config file settings with command-line options,
            # if given
            for arg, value in vars(args).items():
                if value:
                    self.experiment_args[arg] = value
        else:
            self.experiment_args = vars(args)

        # workaround - check if we have all the arguments needed
        if not ("training_queries" in self.experiment_args and
                "test_queries" in self.experiment_args and
                "feature_count" in self.experiment_args and
                "num_runs" in self.experiment_args and
                "num_queries" in self.experiment_args and
                "user_model" in self.experiment_args and
                "user_model_args" in self.experiment_args and
                "system" in self.experiment_args and
                "system_args" in self.experiment_args and
                "output_dir" in self.experiment_args):
            parser.print_help()
            sys.exit("Missing required arguments, please check the program"
                     " arguments or configuration file.\n%s" %
                     self.experiment_args)

        # set default values for optional arguments
        if "query_sampling_method" not in self.experiment_args:
            self.experiment_args["query_sampling_method"] = "random"
        if "output_dir_overwrite" not in self.experiment_args:
            self.experiment_args["output_dir_overwrite"] = False
        if "experimenter" not in self.experiment_args:
            self.experiment_args["experimenter"] = \
                "experiment.LearningExperiment"
        if "evaluation" not in self.experiment_args:
            self.experiment_args["evaluation"] = "evaluation.NdcgEval"
        if "processes" not in self.experiment_args:
            self.experiment_args["processes"] = 0

        # locate or create the output directory for the current fold
        if not os.path.exists(self.experiment_args["output_dir"]):
            os.makedirs(self.experiment_args["output_dir"])
        elif not self.experiment_args["output_dir_overwrite"] and \
                os.listdir(self.experiment_args["output_dir"]):
            # make sure the output directory is empty
            raise Exception("Output dir %s is not an empty directory. "
                "Please use a different directory, or move contents out "
                "of the way." % self.experiment_args["output_dir"])

        logging.basicConfig(
            format='%(levelname)s %(module)s %(asctime)s: %(message)s',
            level=logging.INFO)
        logging.info("Arguments: %s" % self.experiment_args)
        # print the arguments that are used in this execution
        for k, v in sorted(self.experiment_args.items()):
            logging.info("\t%s: %s" % (k, v))

        # back up the effective configuration to the output directory
        config_bk = os.path.join(self.experiment_args["output_dir"],
                                 "config_bk.yml")
        logging.info("Backing up configuration to: %s" % config_bk)
        config_bk_file = open(config_bk, "w")
        yaml.dump(self.experiment_args, config_bk_file,
                  default_flow_style=False)
        config_bk_file.close()

        # load training and test queries
        training_file = self.experiment_args["training_queries"]
        test_file = self.experiment_args["test_queries"]
        self.feature_count = self.experiment_args["feature_count"]
        logging.info("Loading training data: %s " % training_file)
        self.training_queries = load_queries(training_file,
                                             self.feature_count)
        logging.info("... found %d queries." %
                     self.training_queries.get_size())
        logging.info("Loading test data: %s " % test_file)
        self.test_queries = load_queries(test_file, self.feature_count)
        logging.info("... found %d queries." % self.test_queries.get_size())

        # store the settings needed to run the experiment num_runs times
        self.num_runs = self.experiment_args["num_runs"]
        self.output_dir = self.experiment_args["output_dir"]
        self.output_prefix = self.experiment_args["output_prefix"]
        self.experimenter = get_class(self.experiment_args["experimenter"])
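        # Example (illustrative values only): a minimal YAML configuration
        # passed via -f/--file needs at least the keys checked above, plus
        # output_prefix; other keys fall back to the defaults set earlier.
        # The paths, counts, and class/option strings below are placeholders;
        # substitute the user model and system implementations available in
        # your installation.
        #
        #   training_queries: path/to/train.txt   # svmlight-format queries
        #   test_queries: path/to/test.txt
        #   feature_count: 64
        #   num_runs: 5
        #   num_queries: 100
        #   user_model: environment.CascadeUserModel
        #   user_model_args: "--p_click 0:0.0,1:1.0 --p_stop 0:0.0,1:0.0"
        #   system: retrieval_system.ListwiseLearningSystem
        #   system_args: "--init_weights random --alpha 0.01 --delta 1.0"
        #   output_dir: outdir
        #   output_prefix: fold1
        #   processes: 2                          # > 1 enables multiprocessing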
    def run(self):
        if self.experiment_args["processes"] > 1:
            from multiprocessing import Pool
            pool = Pool(processes=self.experiment_args["processes"])
            results = [pool.apply_async(self._run, (run_count,))
                       for run_count in range(self.num_runs)]
            pool.close()
            pool.join()
            for result in results:
                logging.info("Ready: {}".format(result.ready()))
                logging.info("Successful: {}".format(result.successful()))
            return [result.get() for result in results]
        else:
            # run the experiment num_runs times and return the list of results
            return [self._run(run_id) for run_id in range(self.num_runs)]
    def _run(self, run_id):
        logging.info("run %d starts" % run_id)
        # auxiliary log written while the experiment runs
        aux_log_file = os.path.join(self.output_dir, "_%s-%d.txt.gz" %
                                    (self.output_prefix, run_id))
        aux_log_fh = gzip.open(aux_log_file, "wb")
        # returns a summary after running the experiment
        summarized_experiment = self.run_experiment(aux_log_fh)
        aux_log_fh.close()
        # set up the result log file and dump the summary to it
        log_file = os.path.join(self.output_dir, "%s-%d.txt.gz" %
                                (self.output_prefix, run_id))
        log_fh = gzip.open(log_file, "wb")
        yaml.dump(summarized_experiment, log_fh, default_flow_style=False)
        log_fh.close()
        logging.info("run %d done" % run_id)
        return summarized_experiment
    def run_experiment(self, aux_log_fh):
        # run a single experiment with the given parameters
        experiment = self.experimenter(self.training_queries,
                                       self.test_queries,
                                       self.feature_count,
                                       aux_log_fh,
                                       self.experiment_args)
        return experiment.run()
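
For illustration, the class can be driven from a script roughly as follows. The import path follows the module name in this page's title, and the config file path is a hypothetical placeholder (e.g., a file with contents like the example sketched at the end of __init__):

    from lerot.experiment.GenericExperiment import GenericExperiment

    # an args string is parsed exactly like command-line arguments;
    # "conf/experiment.yml" is a placeholder path to a config file
    experiment = GenericExperiment("-f conf/experiment.yml")

    # returns a list with one experiment summary per run; each summary is
    # also written to OUTPUT_DIR/PREFIX-RUN_ID.txt.gz as gzipped YAML
    summaries = experiment.run()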