SMT-RAT  24.02
Toolbox for Strategic and Parallel Satisfiability-Modulo-Theories Solving
SlurmUtilities.cpp
Go to the documentation of this file.
1 #include "SlurmUtilities.h"
2 
3 #include <regex>
4 #include <sstream>
5 
7 
8 namespace benchmax {
9 namespace slurm {
10 
12  std::string output;
13  std::stringstream ss;
14  ss << "tar --force-local -czf " << p.filename_archive << " ";
15  ss << "-C " << p.tmp_dir << " ";
16  ss << "`ls " << p.tmp_dir << "`";
17  BENCHMAX_LOG_DEBUG("benchmax.slurm", "Archiving log files with command " << ss.str());
18  int code = call_program(ss.str(), output);
19  if (code == 0) {
20  BENCHMAX_LOG_INFO("benchmax.slurm", "Archived log files in " << p.filename_archive << " from " << p.tmp_dir);
21  } else {
22  BENCHMAX_LOG_WARN("benchmax.slurm", "Archiving of log files failed with exit code " << code);
23  BENCHMAX_LOG_WARN("benchmax.slurm", output);
24  }
25 }
26 
27 std::vector<fs::path> collect_result_files(const fs::path& basedir) {
28  BENCHMAX_LOG_DEBUG("benchmax.slurm", "Collecting results files from " << basedir);
29  std::vector<fs::path> files;
30 
31  std::regex filenamere("JOB.[0-9]+_[0-9]+.(out|err)");
32  for (const auto& f: fs::directory_iterator(basedir)) {
33  if (!std::regex_match(f.path().filename().string(), filenamere)) {
34  BENCHMAX_LOG_TRACE("benchmax.slurm", "Skipping file " << f.path());
35  continue;
36  }
37  BENCHMAX_LOG_DEBUG("benchmax.slurm", "Using file " << f.path());
38  files.emplace_back(f.path());
39  }
40  BENCHMAX_LOG_INFO("benchmax.slurm", "Collected " << files.size() << " log files.");
41  return files;
42 }
43 
45  std::string filename = p.tmp_dir + "/job-" + p.file_suffix + ".job";
46  BENCHMAX_LOG_INFO("benchmax.slurm", "Generating submitfile " << filename);
47  std::ofstream out(filename);
48  out << "#!/usr/bin/env zsh" << std::endl;
49  out << "### Job name" << std::endl;
50  // Job name
51  out << "#SBATCH --job-name=benchmax" << std::endl;
52  // Output files (stdout and stderr)
53  out << "#SBATCH -o " << p.tmp_dir << "/JOB.%A_%a.out" << std::endl;
54  out << "#SBATCH -e " << p.tmp_dir << "/JOB.%A_%a.err" << std::endl;
55 
56  // Rough estimation of time in minutes (timeout * jobs)
57  auto timeout = (std::chrono::seconds(p.limit_time) + std::chrono::seconds(p.grace_time));
58  long minutes = std::chrono::duration_cast<std::chrono::minutes>(timeout * p.tasks / p.slices).count() * 2;
59  minutes = std::min(minutes, static_cast<long>(60*24));
60 
61  out << "#SBATCH -t " << minutes << std::endl;
62  // Memory usage in MB
63  out << "#SBATCH --mem-per-cpu=" << p.limit_memory.mebi() + 1024 << "M" << std::endl;
64 
65  // Load environment
66  out << "source ~/load_environment" << std::endl;
67  // Change current directory
68  out << "cd " << p.tmp_dir << std::endl;
69 
70  // Calculate slices for jobfile
71  out << "min=$SLURM_ARRAY_TASK_MIN" << std::endl;
72  out << "max=$SLURM_ARRAY_TASK_MAX" << std::endl;
73  out << "cur=$SLURM_ARRAY_TASK_ID" << std::endl;
74  out << "tasks=" << p.tasks << std::endl;
75  out << "jobcount=$(( max - min + 1 ))" << std::endl;
76  out << "slicesize=$(( (tasks + jobcount + 1) / jobcount ))" << std::endl;
77  out << "start=$(( (cur - 1) * slicesize + min ))" << std::endl;
78  out << "end=$(( start + slicesize - 1 ))" << std::endl;
79 
80  // Execute this slice
81  out << "for i in `seq ${start} ${end}`; do" << std::endl;
82  out << "\tcmd=$(time sed -n \"${i}p\" < " << p.filename_jobs << ")" << std::endl;
83  out << "\techo \"Executing $cmd\"" << std::endl;
84  out << "\techo \"# START ${i} #\"" << std::endl;
85  out << "\techo \"# START ${i} #\" >&2" << std::endl;
86  out << "\tstart=`date +\"%s%3N\"`" << std::endl;
87  out << "\tulimit -c 0 && ulimit -S -v " << p.limit_memory.kibi() << " && ulimit -S -t " << std::chrono::seconds(timeout).count() << " && eval /usr/bin/time -v $cmd ; rc=$?" << std::endl;
88  out << "\tend=`date +\"%s%3N\"`" << std::endl;
89  out << "\techo \"# END ${i} #\"" << std::endl;
90  out << "\techo \"# END ${i} #\" 1>&2" << std::endl;
91  out << "\techo \"time: $(( end - start ))\"" << std::endl;
92  out << "\techo \"exitcode: $rc\"" << std::endl;
93  out << "\techo \"# END DATA ${i} #\"" << std::endl;
94  out << "done" << std::endl;
95  out.close();
96 
97  return filename;
98 }
99 
100 
102  std::string filename = p.tmp_dir + "/job-" + p.file_suffix + ".job";
103  BENCHMAX_LOG_INFO("benchmax.slurm", "Generating submitfile " << filename);
104  std::ofstream out(filename);
105  out << "#!/usr/bin/env zsh" << std::endl;
106  out << "### Job name" << std::endl;
107  // Job name
108  out << "#SBATCH --job-name=benchmax" << std::endl;
109  // Output files (stdout and stderr)
110  out << "#SBATCH -o " << p.tmp_dir << "/JOB.%A_%a.out" << std::endl;
111  out << "#SBATCH -e " << p.tmp_dir << "/JOB.%A_%a.err" << std::endl;
112 
113  // Rough estimation of time in minutes (timeout * slice_size)
114  auto timeout = (std::chrono::seconds(p.limit_time) + std::chrono::seconds(p.grace_time));
115  long minutes = std::chrono::duration_cast<std::chrono::minutes>(timeout * p.slice_size).count() * 2;
116  minutes = std::min(minutes + 1, static_cast<long>(60*24));
117 
118  out << "#SBATCH -t " << minutes << std::endl;
119  // Memory usage in MB
120  out << "#SBATCH --mem-per-cpu=" << p.limit_memory.mebi() + 1024 << "M" << std::endl;
121 
122  // Load environment
123  out << "source ~/load_environment" << std::endl;
124  // Change current directory
125  out << "cd " << p.tmp_dir << std::endl;
126 
127  // Calculate slices for jobfile
128  out << "min=$SLURM_ARRAY_TASK_MIN" << std::endl;
129  out << "max=$SLURM_ARRAY_TASK_MAX" << std::endl;
130  out << "cur=$SLURM_ARRAY_TASK_ID" << std::endl;
131  out << "slicesize=" << p.slice_size << std::endl;
132  out << "start=$(( (cur - 1) * slicesize + 1 + " << p.job_range.first << " ))" << std::endl;
133  out << "end=$(( start + slicesize - 1 + " << p.job_range.first << " ))" << std::endl;
134  out << "end=$((end<" << p.job_range.second << " ? end : " << p.job_range.second << "))" << std::endl;
135 
136  // Execute this slice
137  out << "for i in `seq ${start} ${end}`; do" << std::endl;
138  out << "lineidx=$(( i - " << p.job_range.first << " ))" << std::endl;
139  out << "\tcmd=$(time sed -n \"${lineidx}p\" < " << p.filename_jobs << ")" << std::endl;
140  out << "\techo \"Executing $cmd\"" << std::endl;
141  out << "\techo \"# START ${i} #\"" << std::endl;
142  out << "\techo \"# START ${i} #\" >&2" << std::endl;
143  out << "\tstart=`date +\"%s%3N\"`" << std::endl;
144  // out << "\tulimit -c 0 && ulimit -S -v " << p.limit_memory.kibi() << " && ulimit -S -t " << std::chrono::seconds(timeout).count() << " && eval /usr/bin/time -v $cmd ; rc=$?" << std::endl;
145  out << "\tulimit -c 0 && ulimit -S -v " << p.limit_memory.kibi() << " && eval /usr/bin/time -v timeout --signal=TERM --preserve-status " << std::chrono::seconds(timeout).count() << "s $cmd ; rc=$?" << std::endl;
146  out << "\tend=`date +\"%s%3N\"`" << std::endl;
147  out << "\techo \"# END ${i} #\"" << std::endl;
148  out << "\techo \"# END ${i} #\" 1>&2" << std::endl;
149  out << "\techo \"time: $(( end - start ))\"" << std::endl;
150  out << "\techo \"exitcode: $rc\"" << std::endl;
151  out << "\techo \"# END DATA ${i} #\"" << std::endl;
152  out << "done" << std::endl;
153  out.close();
154 
155  return filename;
156 }
157 
158 int parse_job_id(const std::string& output) {
159  std::regex r("Submitted batch job ([0-9]+)");
160  std::smatch m;
161  if (std::regex_search(output, m, r)) {
162  BENCHMAX_LOG_DEBUG("benchmax.slurm", "Job ID is " << m[1]);
163  return std::stoi(m[1]);
164  } else {
165  BENCHMAX_LOG_ERROR("benchmax.slurm", "Unable to obtain job id from slurm output.");
166  return -1;
167  }
168 }
169 
170 std::string parse_result_info(const std::string& content, const std::string& name) {
171  std::regex re(name + ": (.*)");
172  std::smatch m;
173  if (std::regex_search(content, m, re)) {
174  BENCHMAX_LOG_TRACE("benchmax.slurm", "Retrieved " << name << " = " << m[1]);
175  return m[1];
176  } else {
177  BENCHMAX_LOG_INFO("benchmax.slurm", "Did not find expected information " << name << " in " << content);
178  return "";
179  }
180 }
181 
182 void remove_log_files(const std::vector<fs::path>& files, bool remove) {
183  if (remove) {
184  BENCHMAX_LOG_INFO("benchmax.slurm", "Removing log files.");
185  for (const auto& f: files) {
187  }
188  } else {
189  BENCHMAX_LOG_INFO("benchmax.slurm", "Retaining log files.");
190  }
191 }
192 
193 void clear_directory(const fs::path& basedir) {
194  BENCHMAX_LOG_INFO("benchmax.slurm", "Clear directory " << basedir);
195  for (const auto& entry : std::filesystem::directory_iterator(basedir)) {
196  std::filesystem::remove_all(entry.path());
197  }
198  BENCHMAX_LOG_INFO("benchmax.slurm", "Cleared");
199  // std::vector<fs::path> files;
200  // std::regex filenamere("JOB.[0-9]+_[0-9]+.(out|err)");
201  // for (const auto& f: fs::directory_iterator(basedir)) {
202  // if (!std::regex_match(f.path().filename().string(), filenamere)) {
203  // continue;
204  // }
205  // files.emplace_back(f.path());
206  // }
207  // for (const auto& f: files) {
208  // std::filesystem::remove(f);
209  // }
210  // BENCHMAX_LOG_INFO("benchmax.slurm", "Cleared " << files.size() << " log files.");
211 }
212 
213 bool is_job_finished(int jobid) {
214  std::stringstream cmd;
215  cmd << "sacct -o state -j " << jobid;
216  BENCHMAX_LOG_DEBUG("benchmax.slurm", "Command: " << cmd.str());
217  std::string output;
218  call_program(cmd.str(), output, false);
219 
220  std::istringstream iss(output);
221  std::string line;
222  std::getline(iss, line);
223  assert(line.find("State") != std::string::npos);
224  std::getline(iss, line);
225  assert(line.find("----------") != std::string::npos);
226  while (std::getline(iss, line)) {
227  if (line.find("COMPLETED") == std::string::npos && line.find("CANCELLED") == std::string::npos && line.find("TIMEOUT") == std::string::npos) {
228  return false;
229  }
230  }
231  return true;
232 }
233 
234 }
235 }
#define BENCHMAX_LOG_DEBUG(channel, msg)
Log debug messages.
Definition: logging.h:55
#define BENCHMAX_LOG_WARN(channel, msg)
Log warnings.
Definition: logging.h:51
#define BENCHMAX_LOG_TRACE(channel, msg)
Log trace messages.
Definition: logging.h:57
#define BENCHMAX_LOG_INFO(channel, msg)
Log informational messages.
Definition: logging.h:53
#define BENCHMAX_LOG_ERROR(channel, msg)
Log errors.
Definition: logging.h:49
static void remove(V &ts, const T &t)
Definition: Alg.h:36
void clear_directory(const fs::path &basedir)
Clear log files from directory.
std::string generate_submit_file(const SubmitfileProperties &p)
Generate a submit file for Slurm with the given properties.
void remove_log_files(const std::vector< fs::path > &files, bool remove)
Remove the given list of files.
void archive_log_files(const ArchiveProperties &p)
Put all log files into an archive.
std::string generate_submit_file_chunked(const ChunkedSubmitfileProperties &p)
int parse_job_id(const std::string &output)
Parses the job id from the output of sbatch.
std::vector< fs::path > collect_result_files(const fs::path &basedir)
Collects all result files in the given base directory for this job id.
std::string parse_result_info(const std::string &content, const std::string &name)
Parse a single result information from the output.
bool is_job_finished(int jobid)
Checks if the given job is finished.
int call_program(const std::string &commandline, std::string &stdout, bool print_to_stdout=false)
Runs an external program from some command line and records the output to stdout.
Definition: execute.h:18
All properties needed to archive log files.
std::string filename_archive
Filename of the archive.
std::string tmp_dir
Temporary directory to look for output files.
All properties needed to create a submit file.
carl::settings::duration grace_time
Grace time in seconds.
std::string tmp_dir
Temporary directory for log files.
carl::settings::binary_quantity limit_memory
Memory limit in megabytes.
carl::settings::duration limit_time
Time limit in seconds.
std::string filename_jobs
Filename of the job list file.
std::string file_suffix
Suffix for job and submit file.
std::pair< std::size_t, std::size_t > job_range
This slice size.
All properties needed to create a submit file.
carl::settings::duration grace_time
Grace time in seconds.
carl::settings::duration limit_time
Time limit in seconds.
std::size_t slices
Number of slices.
std::string file_suffix
Suffix for job and submit file.
std::size_t tasks
Number of tasks.
std::string tmp_dir
Temporary directory for log files.
std::string filename_jobs
Filename of the job list file.
carl::settings::binary_quantity limit_memory
Memory limit in megabytes.