15 ss <<
"-C " << p.
tmp_dir <<
" ";
16 ss <<
"`ls " << p.
tmp_dir <<
"`";
22 BENCHMAX_LOG_WARN(
"benchmax.slurm",
"Archiving of log files failed with exit code " << code);
29 std::vector<fs::path> files;
31 std::regex filenamere(
"JOB.[0-9]+_[0-9]+.(out|err)");
32 for (
const auto& f: fs::directory_iterator(basedir)) {
33 if (!std::regex_match(f.path().filename().string(), filenamere)) {
38 files.emplace_back(f.path());
40 BENCHMAX_LOG_INFO(
"benchmax.slurm",
"Collected " << files.size() <<
" log files.");
47 std::ofstream out(filename);
48 out <<
"#!/usr/bin/env zsh" << std::endl;
49 out <<
"### Job name" << std::endl;
51 out <<
"#SBATCH --job-name=benchmax" << std::endl;
53 out <<
"#SBATCH -o " << p.
tmp_dir <<
"/JOB.%A_%a.out" << std::endl;
54 out <<
"#SBATCH -e " << p.
tmp_dir <<
"/JOB.%A_%a.err" << std::endl;
58 long minutes = std::chrono::duration_cast<std::chrono::minutes>(timeout * p.
tasks / p.
slices).count() * 2;
59 minutes = std::min(minutes,
static_cast<long>(60*24));
61 out <<
"#SBATCH -t " << minutes << std::endl;
63 out <<
"#SBATCH --mem-per-cpu=" << p.
limit_memory.mebi() + 1024 <<
"M" << std::endl;
66 out <<
"source ~/load_environment" << std::endl;
68 out <<
"cd " << p.
tmp_dir << std::endl;
71 out <<
"min=$SLURM_ARRAY_TASK_MIN" << std::endl;
72 out <<
"max=$SLURM_ARRAY_TASK_MAX" << std::endl;
73 out <<
"cur=$SLURM_ARRAY_TASK_ID" << std::endl;
74 out <<
"tasks=" << p.
tasks << std::endl;
75 out <<
"jobcount=$(( max - min + 1 ))" << std::endl;
76 out <<
"slicesize=$(( (tasks + jobcount + 1) / jobcount ))" << std::endl;
77 out <<
"start=$(( (cur - 1) * slicesize + min ))" << std::endl;
78 out <<
"end=$(( start + slicesize - 1 ))" << std::endl;
81 out <<
"for i in `seq ${start} ${end}`; do" << std::endl;
82 out <<
"\tcmd=$(time sed -n \"${i}p\" < " << p.
filename_jobs <<
")" << std::endl;
83 out <<
"\techo \"Executing $cmd\"" << std::endl;
84 out <<
"\techo \"# START ${i} #\"" << std::endl;
85 out <<
"\techo \"# START ${i} #\" >&2" << std::endl;
86 out <<
"\tstart=`date +\"%s%3N\"`" << std::endl;
87 out <<
"\tulimit -c 0 && ulimit -S -v " << p.
limit_memory.kibi() <<
" && ulimit -S -t " << std::chrono::seconds(timeout).count() <<
" && eval /usr/bin/time -v $cmd ; rc=$?" << std::endl;
88 out <<
"\tend=`date +\"%s%3N\"`" << std::endl;
89 out <<
"\techo \"# END ${i} #\"" << std::endl;
90 out <<
"\techo \"# END ${i} #\" 1>&2" << std::endl;
91 out <<
"\techo \"time: $(( end - start ))\"" << std::endl;
92 out <<
"\techo \"exitcode: $rc\"" << std::endl;
93 out <<
"\techo \"# END DATA ${i} #\"" << std::endl;
94 out <<
"done" << std::endl;
104 std::ofstream out(filename);
105 out <<
"#!/usr/bin/env zsh" << std::endl;
106 out <<
"### Job name" << std::endl;
108 out <<
"#SBATCH --job-name=benchmax" << std::endl;
110 out <<
"#SBATCH -o " << p.
tmp_dir <<
"/JOB.%A_%a.out" << std::endl;
111 out <<
"#SBATCH -e " << p.
tmp_dir <<
"/JOB.%A_%a.err" << std::endl;
115 long minutes = std::chrono::duration_cast<std::chrono::minutes>(timeout * p.
slice_size).count() * 2;
116 minutes = std::min(minutes + 1,
static_cast<long>(60*24));
118 out <<
"#SBATCH -t " << minutes << std::endl;
120 out <<
"#SBATCH --mem-per-cpu=" << p.
limit_memory.mebi() + 1024 <<
"M" << std::endl;
123 out <<
"source ~/load_environment" << std::endl;
125 out <<
"cd " << p.
tmp_dir << std::endl;
128 out <<
"min=$SLURM_ARRAY_TASK_MIN" << std::endl;
129 out <<
"max=$SLURM_ARRAY_TASK_MAX" << std::endl;
130 out <<
"cur=$SLURM_ARRAY_TASK_ID" << std::endl;
131 out <<
"slicesize=" << p.
slice_size << std::endl;
132 out <<
"start=$(( (cur - 1) * slicesize + 1 + " << p.
job_range.first <<
" ))" << std::endl;
133 out <<
"end=$(( start + slicesize - 1 + " << p.
job_range.first <<
" ))" << std::endl;
134 out <<
"end=$((end<" << p.
job_range.second <<
" ? end : " << p.
job_range.second <<
"))" << std::endl;
137 out <<
"for i in `seq ${start} ${end}`; do" << std::endl;
138 out <<
"lineidx=$(( i - " << p.
job_range.first <<
" ))" << std::endl;
139 out <<
"\tcmd=$(time sed -n \"${lineidx}p\" < " << p.
filename_jobs <<
")" << std::endl;
140 out <<
"\techo \"Executing $cmd\"" << std::endl;
141 out <<
"\techo \"# START ${i} #\"" << std::endl;
142 out <<
"\techo \"# START ${i} #\" >&2" << std::endl;
143 out <<
"\tstart=`date +\"%s%3N\"`" << std::endl;
145 out <<
"\tulimit -c 0 && ulimit -S -v " << p.
limit_memory.kibi() <<
" && eval /usr/bin/time -v timeout --signal=TERM --preserve-status " << std::chrono::seconds(timeout).count() <<
"s $cmd ; rc=$?" << std::endl;
146 out <<
"\tend=`date +\"%s%3N\"`" << std::endl;
147 out <<
"\techo \"# END ${i} #\"" << std::endl;
148 out <<
"\techo \"# END ${i} #\" 1>&2" << std::endl;
149 out <<
"\techo \"time: $(( end - start ))\"" << std::endl;
150 out <<
"\techo \"exitcode: $rc\"" << std::endl;
151 out <<
"\techo \"# END DATA ${i} #\"" << std::endl;
152 out <<
"done" << std::endl;
159 std::regex r(
"Submitted batch job ([0-9]+)");
161 if (std::regex_search(output, m, r)) {
163 return std::stoi(m[1]);
171 std::regex re(name +
": (.*)");
173 if (std::regex_search(content, m, re)) {
177 BENCHMAX_LOG_INFO(
"benchmax.slurm",
"Did not find expected information " << name <<
" in " << content);
185 for (
const auto& f: files) {
195 for (
const auto& entry : std::filesystem::directory_iterator(basedir)) {
196 std::filesystem::remove_all(entry.path());
214 std::stringstream cmd;
215 cmd <<
"sacct -o state -j " << jobid;
220 std::istringstream iss(output);
222 std::getline(iss, line);
223 assert(line.find(
"State") != std::string::npos);
224 std::getline(iss, line);
225 assert(line.find(
"----------") != std::string::npos);
226 while (std::getline(iss, line)) {
227 if (line.find(
"COMPLETED") == std::string::npos && line.find(
"CANCELLED") == std::string::npos && line.find(
"TIMEOUT") == std::string::npos) {
#define BENCHMAX_LOG_DEBUG(channel, msg)
Log debug messages.
#define BENCHMAX_LOG_WARN(channel, msg)
Log warnings.
#define BENCHMAX_LOG_TRACE(channel, msg)
Log trace messages.
#define BENCHMAX_LOG_INFO(channel, msg)
Log informational messages.
#define BENCHMAX_LOG_ERROR(channel, msg)
Log errors.
static void remove(V &ts, const T &t)
void clear_directory(const fs::path &basedir)
Clear log files from directory.
std::string generate_submit_file(const SubmitfileProperties &p)
Generate a submit file for Slurm with the given properties.
void remove_log_files(const std::vector< fs::path > &files, bool remove)
Remove the given list of files.
void archive_log_files(const ArchiveProperties &p)
Put all log files into an archive.
std::string generate_submit_file_chunked(const ChunkedSubmitfileProperties &p)
int parse_job_id(const std::string &output)
Parses the job id from the output of sbatch.
std::vector< fs::path > collect_result_files(const fs::path &basedir)
Collects all result files (job output and error logs) in the given base directory.
std::string parse_result_info(const std::string &content, const std::string &name)
Parse a single result information from the output.
bool is_job_finished(int jobid)
Checks if the given job is finished.
int call_program(const std::string &commandline, std::string &stdout, bool print_to_stdout=false)
Runs an external program from some command line and records the output to stdout.
All properties needed to archive log files.
std::string filename_archive
Filename of the archive.
std::string tmp_dir
Temporary directory to look for output files.
All properties needed to create a submit file.
carl::settings::duration grace_time
Grace time in seconds.
std::string tmp_dir
Temporary directory for log files.
std::size_t slice_size
Slice size.
carl::settings::binary_quantity limit_memory
Memory limit in megabytes.
carl::settings::duration limit_time
Time limit in seconds.
std::string filename_jobs
Filename of the job list file.
std::string file_suffix
Suffix for job and submit file.
std::pair< std::size_t, std::size_t > job_range
Range of job indices (first, second) covered by this submit file.
All properties needed to create a submit file.
carl::settings::duration grace_time
Grace time in seconds.
carl::settings::duration limit_time
Time limit in seconds.
std::size_t slices
Number of slices.
std::string file_suffix
Suffix for job and submit file.
std::size_t tasks
Number of tasks.
std::string tmp_dir
Temporary directory for log files.
std::string filename_jobs
Filename of the job list file.
carl::settings::binary_quantity limit_memory
Memory limit in megabytes.