# jobd - perform Girocco maintenance jobs
# Run with --help for details
use POSIX ":sys_wait_h";
use lib "__BASEDIR__";
BEGIN {noFatalsToBrowser}
use Girocco::ExecUtil;
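
# Default number of parallel jobs, derived from the detected cpu count
# (see the --max-parallel description in the POD below).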
return 4 unless defined($cpus) && $cpus ne "" && int($cpus) >= 1;
return int($cpus * 2) if $cpus <= 2;
return 5 if $cpus < 4;
return int($cpus * 1.5) if $cpus <= 10;

my $cpus = online_cpus;

my $max_par = $cpus ? _defjobs($cpus) : 4;
my $max_par_intensive = 1;
my $load_triggers = $cpus ? sprintf("%g,%g", $cpus * 1.5, $cpus * 0.75) : "6,3";
my $lockfile = "/tmp/jobd-$Girocco::Config::tmpsuffix.lock";
my $restart_delay = 300;

my ($update_only, $gc_only, $needs_gc_only);
my ($load_trig, $load_untrig);

sub update_project {
    # ...
    my $p = $job->{'project'};
    check_project_exists($job) || return;
    my $projpath = get_project_path($p);
    if ($gc_only || $needs_gc_only ||
        -e "$projpath/.nofetch" ||
        -e "$projpath/.bypass" ||
        -e "$projpath/.bypass_fetch" ||
        is_mirror_disabled($p)) {
        # ...
        setup_gc($job) unless ! -e "$projpath/.nofetch" &&
            -e "$projpath/.clone_in_progress" && ! -e "$projpath/.clone_failed";
        # ...
    }
    if (-e "$projpath/.clone_in_progress" && ! -e "$projpath/.clone_failed") {
        job_skip($job, "initial mirroring not complete yet");
        # ...
    }
    if (-e "$projpath/.clone_failed" || -e "$projpath/.clone_failed_exceeds_limit") {
        job_skip($job, "initial mirroring failed");
        # Still need to gc clones even if they've failed
        # ...
    }
    if (my $ts = is_operation_uptodate($p, 'lastrefresh', rand_adjust($Girocco::Config::min_mirror_interval))) {
        job_skip($job, "not needed right now, last run at $ts");
        # ...
    }
    if (is_svn_clone($p)) {
        # git svn can be very, very slow at times
        $job->{'timeout_factor'} = 3;
    }
    exec_job_command($job, ["$Girocco::Config::basedir/jobd/update.sh", $p], $quiet);
}

sub gc_project {
    # ...
    my $p = $job->{'project'};
    check_project_exists($job) || return;
    my $projpath = get_project_path($p);
    if ($update_only || -e "$projpath/.nogc" || -e "$projpath/.bypass" ||
        (-e "$projpath/.delaygc" && ! -e "$projpath/.allowgc" && ! -e "$projpath/.needsgc")) {
        # ...
    }
    if (! -e "$projpath/.needsgc" && ($needs_gc_only ||
        ($ts = is_operation_uptodate($p, 'lastgc', rand_adjust($Girocco::Config::min_gc_interval))))) {
        job_skip($job, ($needs_gc_only ? undef : "not needed right now, last run at $ts"));
        # ...
    }
    # allow garbage collection to run for longer than an update
    $job->{'lastgc'} = get_git_config($projpath, "gitweb.lastgc");
    $job->{'timeout_factor'} = 3;
    exec_job_command($job, ["$Girocco::Config::basedir/jobd/gc.sh", $p], $quiet);
}

# ...
    project => $job->{'project'},
    command => \&gc_project,
    on_success => \&maybe_setup_gc_again,

sub maybe_setup_gc_again {
    # If lastgc was set then gc.sh ran successfully; if it is now no longer
    # set then queue up another run of gc.sh for the project.
    # However, just in case, no matter what happens with the extra
    # gc.sh run no more "bonus" runs are possible to avoid any loops.
    # This allows a "mini" gc that triggers a full gc to have the
    # full gc run as part of the same --all-once run through instead
    # of waiting. A very good thing for users of the --all-once option.
    if ($job->{'lastgc'}) {
        my $projpath = get_project_path($job->{'project'});
        get_git_config($projpath, "gitweb.lastgc") or
            # ...
            project => $job->{'project'},
            command => \&gc_project,
            # ...
    }
}

sub check_project_exists {
    my $p = $job->{'project'};
    if (! -d get_project_path($p)) {
        job_skip($job, "non-existent project");
        # ...
    }
    # ...
}

sub get_project_path {
    "$Girocco::Config::reporoot/".shift().".git";
}
my $_last_config_path;

$_last_config_path = "";
$_last_config_id = "";

my ($projdir, $name) = @_;
defined($projdir) && -d $projdir && -f "$projdir/config" or return undef;
my $cf = "$projdir/config";
my @stat = stat($cf);
@stat && $stat[7] && $stat[9] or return undef;
my $id = join(":", $stat[0], $stat[1], $stat[7], $stat[9]); # dev,ino,size,mtime
if ($_last_config_path ne $cf || $_last_config_id ne $id || ref($_last_config) ne 'HASH') {
    my $data = read_config_file_hash($cf);
    defined($data) or $data = {};
    $_last_config_path = $_last_config_id = "";
    $_last_config = $data;
    $_last_config_id = $id;
    $_last_config_path = $cf;
}
return $_last_config->{$name};
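
# Returns the stored gitweb.$which timestamp if the operation ran within the
# last $threshold seconds, otherwise undef.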
sub is_operation_uptodate {
    my ($project, $which, $threshold) = @_;
    my $path = get_project_path($project);
    my $timestamp = get_git_config($path, "gitweb.$which");
    defined($timestamp) or $timestamp = '';
    my $unix_ts = parse_rfc2822_date($timestamp) || 0;
    (time - $unix_ts) <= $threshold ? $timestamp : undef;
}

sub is_mirror_disabled {
    # ...
    my $path = get_project_path($project);
    my $baseurl = get_git_config($path, 'gitweb.baseurl');
    defined($baseurl) or $baseurl = '';
    $baseurl =~ s/^\s+//;
    $baseurl =~ s/\s+$//;
    return $baseurl eq "" || $baseurl =~ /\s/ || $baseurl =~ /^disabled(?:\s|$)/i;
}

sub is_svn_clone {
    # ...
    my $path = get_project_path($project);
    my $baseurl = get_git_config($path, 'gitweb.baseurl');
    defined($baseurl) or $baseurl = '';
    my $svnurl = get_git_config($path, 'svn-remote.svn.url');
    defined($svnurl) or $svnurl = '';
    return $baseurl =~ /^svn[:+]/i && $svnurl;
}

# ...
    command => \&update_project,
    on_success => \&setup_gc,
    on_error => \&setup_gc,

queue_one($_) for (Girocco::Project->get_full_list());

######### Daemon operation {{{1

# Kills and reaps the specified pid. Returns exit status ($?) on success,
# otherwise undef if the process could not be killed or reaped.
# First sends SIGINT and if the process does not exit within 15 seconds then SIGKILL.
# We used to send SIGTERM instead of SIGINT, but by using SIGINT we can take
# advantage of "tee -i" in our update scripts and really anything we're killing
# should respond the same to either SIGINT or SIGTERM and exit gracefully.
# Usage: my $exitcode = kill_gently($pid, $kill_process_group = 0);
sub kill_gently {
    my $targ = shift;
    my $use_pg = shift || 0;
    # Note that the docs for Perl's kill state that a negative signal
    # number should be used to kill process groups and that while a
    # negative process id (and positive signal number) may also do that
    # on some platforms, that's not portable.
    my $pg = $use_pg ? -1 : 1;
    my $harsh = time() + 15; # SIGKILL after this delay
    my $count = kill(2*$pg, $targ); # SIGINT is 2
    my $reaped = waitpid($targ, WNOHANG);
    return undef if $reaped < 0;
    return $? if $reaped == $targ;
    while ($count && time() < $harsh) {
        select(undef, undef, undef, 0.2);
        $reaped = waitpid($targ, WNOHANG);
        return undef if $reaped < 0;
        return $? if $reaped == $targ;
    }
    $count = kill(9*$pg, $targ); # SIGKILL is 9
    $reaped = waitpid($targ, WNOHANG);
    return undef if $reaped < 0;
    return $? if $reaped == $targ;
    # We should not need to wait to reap a SIGKILL, however, just in case
    # the system doesn't make a SIGKILL'd process immediately reapable
    # (perhaps under extremely heavy load) we accommodate a brief delay
    while ($count && time() < $harsh) {
        select(undef, undef, undef, 0.2);
        $reaped = waitpid($targ, WNOHANG);
        return undef if $reaped < 0;
        return $? if $reaped == $targ;
    }
    # ...
}

sub handle_softexit {
    error("Waiting for outstanding jobs to finish... ".
          "^C again to exit immediately");
    # ...
    $SIG{'INT'} = \&handle_exit;
}

sub handle_exit {
    error("Killing outstanding jobs, please be patient...");
    $SIG{'TERM'} = 'IGNORE';
    # ...
    kill_gently($_->{'pid'}, 1);
    # ...
    unlink $lockfile if ($locked);
    # ...
}

# ...
$opts{'queued_at'} = time;
$opts{'dont_run'} = 0;
$opts{'intensive'} = 0 unless exists $opts{'intensive'};

# ...
$job->{'command'}->($job);
if ($job->{'dont_run'}) {
    # ...
}

sub _job_name {
    # ...
    "[".$job->{'type'}."::".$job->{'project'}."]";
}

# Only one of those per job!
sub exec_job_command {
    my ($job, $command, $err_only) = @_;
    # ...
    $job->{'finished'} = 0;
    delete $job->{'pid'};
    if (!defined($pid = fork)) {
        error(_job_name($job) ." Can't fork job: $!");
        $job->{'finished'} = 1;
        # ...
    }
    # ...
    select(undef, undef, undef, 0.1);
    open STDIN, '<', '/dev/null' or do {
        error(_job_name($job) ." Can't read from /dev/null: $!");
        # ...
    };
    open STDOUT, '>', '/dev/null' or do {
        error(_job_name($job) ." Can't write to /dev/null: $!");
        # ...
    };
    # ...
    # New process group so we can keep track of all of its children
    if (!defined(POSIX::setpgid(0, 0))) {
        error(_job_name($job) ." Can't create process group: $!");
        # ...
    }
    # ...
    # Stop perl from complaining
    # ...
    $job->{'pid'} = $pid;
    $job->{'started_at'} = time;
}

sub job_skip {
    my ($job, $msg) = @_;
    $job->{'dont_run'} = 1;
    error(_job_name($job) ." Skipping job: $msg") unless $quiet || !$msg;
    # ...
}

sub reap_hanging_jobs {
    # ...
    my $factor = $_->{'timeout_factor'} || 1;
    if (defined($_->{'started_at'}) && (time - $_->{'started_at'}) > ($kill_after * $factor)) {
        $_->{'finished'} = 1;
        my $exitcode = kill_gently($_->{'pid'}, 1);
        # ...
        error(_job_name($_) ." KILLED due to timeout" .
              (($exitcode & 0x7f) == 9 ? " with SIGKILL" : ""));
        push @jobs_killed, _job_name($_);
        # ...
    }
    # ...
}

if (!$job->{'finished'}) {
    $job->{'on_success'}->($job) if defined($job->{'on_success'});
    $job->{'finished'} = 1;
    # ...
} else {
    $job->{'on_error'}->($job) if defined($job->{'on_error'});
}

sub reap_finished_jobs {
    # ...
    my $finished_any = 0;
    foreach my $child (grep { !$_->{'pid'} && $_->{'killed'} } @running) {
        delete $child->{'killed'};
        reap_one_job($child);
        # ...
    }
    # ...
    $pid = waitpid(-1, WNOHANG);
    # ...
    my @child = grep { $_->{'pid'} && $_->{'pid'} == $pid } @running;
    # any non-zero exit status should trigger on_error
    $child[0]->{'finished'} = 1 if @child;
    # ...
    delete $child[0]->{'pid'};
    reap_one_job($child[0]);
    # ...
    @running = grep { $_->{'finished'} == 0 } @running;
    # ...
}

sub have_intensive_jobs {
    grep { $_->{'intensive'} == 1 } @running;
}

sub ts {
    "[". scalar(localtime) ."] ";
}
sub get_load_info {
    if ($^O eq "linux") {
        # Read /proc/loadavg on Linux
        open(LOADAV, '<', '/proc/loadavg') or return undef;
        my $loadinfo = <LOADAV>;
        # ...
        return (split(/\s/, $loadinfo, 4))[0..2];
    }
    # Read the output of uptime everywhere else (works on Linux too)
    open(LOADAV, '-|', 'uptime') or return undef;
    my $loadinfo = <LOADAV>;
    # ...
    $loadinfo =~ /load average[^0-9.]*([0-9.]+)[^0-9.]+([0-9.]+)[^0-9.]+([0-9.]+)/iso or return undef;
    # ...
}

my $last_progress = $start;
my $last_checkload = $start - 5;
my $current_load = $load_trig;
# ...
my $s = @queue == 1 ? '' : 's';
ferror("--- Processing %d queued job$s", scalar(@queue));
$SIG{'INT'} = \&handle_softexit;
$SIG{'TERM'} = \&handle_exit;
while (@queue || @running) {
    # ...
    my $proceed_immediately = reap_finished_jobs();
    # Check current system load
    if ($load_trig && (time - $last_checkload) >= 5 &&
        defined((my @loadinfo = get_load_info())[0])) {
        my $current_load = $loadinfo[0];
        if ($current_load > $load_trig && !$overloaded) {
            # ...
            error("PAUSE: system load is at $current_load > $load_trig") if $progress;
        } elsif ($current_load < $load_untrig && $overloaded) {
            # ...
            error("RESUME: system load is at $current_load < $load_untrig") if $progress;
        }
        if ($overloaded) {
            $load_info = ', paused (load '. $current_load .')';
        } else {
            $load_info = ', load '. $current_load;
        }
        $last_checkload = time;
    }
    # ...
    if ($progress && (time - $last_progress) >= 60) {
        ferror("STATUS: %d queued, %d running, %d finished, %d skipped, %d killed$load_info",
               scalar(@queue), scalar(@running), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));
        # ...
        push @run_status, _job_name($_)." ". (time - $_->{'started_at'}) ."s";
        # ...
        error("STATUS: currently running: ". join(', ', @run_status));
        # ...
        $last_progress = time;
    }
    # Back off if we're too busy
    if (@running >= $max_par || have_intensive_jobs() >= $max_par_intensive || !@queue || $overloaded) {
        sleep 1 unless $proceed_immediately;
        # ...
    }
    run_job(shift(@queue)) if @queue;
}

my $s = $jobs_executed == 1 ? '' : 's';
ferror("--- Queue processed in %s. %d job$s executed, %d skipped, %d killed.",
       human_duration(time - $start), $jobs_executed, $jobs_skipped, scalar(@jobs_killed));

sub run_perpetually {
    if (-e $lockfile) {
        die "Lockfile '$lockfile' exists. Please make sure no other instance of jobd is running.\n";
    }
    open LOCK, '>', $lockfile or die "Cannot create lockfile '$lockfile': $!\n";
    # ...
    # touch ctime of lockfile to prevent it from being removed by /tmp cleaning
    chmod 0640, $lockfile;
    chmod 0644, $lockfile;
    # check for restart request
    open LOCK, '<', $lockfile or die "Lock file '$lockfile' has disappeared!\n";
    my $request = <LOCK>;
    # ...
    chomp $request if defined($request);
    if (defined($request) && $request eq "restart") {
        # ...
    }
    # ...
    sleep($restart_delay) if $perpetual; # Let the system breathe for a moment
    # ...
}

######### Helpers {{{1

sub error {
    print STDERR ts().shift()."\n";
}

sub ferror {
    error(sprintf($_[0], @_[1..$#_]));
}

my $reexec = Girocco::ExecUtil->new;
my $realpath0 = realpath($0);

close(DATA) if fileno(DATA);

Getopt::Long::Configure('bundling');

my $parse_res = GetOptions(
    'help|h' => sub {
        pod2usage(-verbose => 2, -exitval => 0, -input => $realpath0)},
    'quiet|q' => sub {++$quiet},
    'progress|P' => sub {++$progress},
    'kill-after|k=i' => \$kill_after,
    'max-parallel|p=i' => \$max_par,
    'max-intensive-parallel|i=i' => \$max_par_intensive,
    'load-triggers=s' => \$load_triggers,
    'restart-delay|d=i' => \$restart_delay,
    'lockfile|l=s' => \$lockfile,
    'same-pid' => \$same_pid,
    'all-once|a' => \$all_once,
    'one|o=s' => sub {$one_once{$_[1]} = 1, push(@one, $_[1])
        unless exists $one_once{$_[1]}},
    'update-only' => \$update_only,
    'gc-only' => \$gc_only,
    'needs-gc-only' => \$needs_gc_only,
) || pod2usage(-exitval => 2, -input => $realpath0);
fatal("Error: can only use one out of --all-once and --one")
    if $all_once && @one;
my $onlycnt = ($update_only?1:0) + ($gc_only?1:0) + ($needs_gc_only?1:0);
fatal("Error: can only use one out of --update-only, --gc-only and --needs-gc-only")
    if $onlycnt > 1;
fatal("Error: --update-only, --gc-only or --needs-gc-only requires --all-once or --one")
    if $onlycnt && !($all_once || @one);

delete $ENV{'show_progress'};
# ...
$ENV{'show_progress'} = 0 if $quiet > 1;
# ...
$progress = 1 unless $progress;
$ENV{'show_progress'} = $progress;

$load_triggers = '0,0' unless defined((get_load_info())[0]);
($load_trig, $load_untrig) = split(/,/, $load_triggers);
# ...
queue_one($_) foreach @one;
# ...
if (run_perpetually() eq "restart") {
    error("Restarting in response to restart request... ");
    $reexec->reexec($same_pid);
    error("Continuing after failed restart: $!");
    # ...
}

########## Documentation {{{1

=head1 NAME

jobd.pl - Perform Girocco maintenance jobs

=head1 SYNOPSIS

   -h | --help                             detailed instructions
   -q | --quiet                            run quietly
   -P | --progress                         show occasional status updates
   -k SECONDS | --kill-after SECONDS       how long to wait before killing jobs
   -p NUM | --max-parallel NUM             how many jobs to run at the same time
   -i NUM | --max-intensive-parallel NUM   how many resource-hungry jobs to run
   --load-triggers TRIG,UNTRIG             stop queueing jobs at load above
                                           TRIG and resume at load below UNTRIG
   -d NUM | --restart-delay SECONDS        wait for this many seconds between
                                           queue runs
   -l FILE | --lockfile FILE               create a lockfile in the given
                                           location
   --same-pid                              keep same pid during graceful restart
   -a | --all-once                         process the list only once
   -o PRJNAME | --one PRJNAME              process only one project
   --update-only                           process mirror updates only
   --gc-only                               perform needed garbage collection only
   --needs-gc-only                         perform needed mini gc only

=head1 OPTIONS

=over

=item B<--help>

Print the full description of jobd.pl's options.

=item B<--quiet>

Suppress non-error messages, e.g. for use when running this task as a cronjob.
When given two or more times suppress update ref change lines in logs as well.

=item B<--progress>

Show information about the current status of the job queue occasionally. This
is automatically enabled if --quiet is not given. When specified two or more
times full ref change details will be shown for updates.

=item B<--kill-after SECONDS>

Kill supervised jobs after a certain time to avoid hanging the daemon.

=item B<--max-parallel NUM>

Run no more than that many jobs at the same time. The default is the number
of cpus * 2 for 1 or 2 cpus, 5 for 3 cpus and int(cpus * 1.5) for 4 cpus or
more with the default capped to 16 when more than 10 cpus are detected.
If the number of cpus cannot be determined, the default is 4.
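
Expressed as code, the default works out roughly as follows (an illustrative
sketch of the rules above; C<default_max_parallel> is not necessarily the name
of the internal helper jobd.pl actually uses):

    # default --max-parallel derived from the detected cpu count
    sub default_max_parallel {
        my $cpus = shift;
        return 4 unless defined($cpus) && int($cpus) >= 1;  # cpus undetectable
        return int($cpus * 2)   if $cpus <= 2;              # 1 or 2 cpus
        return 5                if $cpus <  4;              # 3 cpus
        return int($cpus * 1.5) if $cpus <= 10;             # 4 to 10 cpus
        return 16;                                          # capped above 10 cpus
    }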

=item B<--max-intensive-parallel NUM>

Run no more than that many resource-hungry jobs at the same time. Right now,
this refers to repacking jobs. The default is 1.

=item B<--load-triggers TRIG,UNTRIG>

If the first system load average (the 1 minute average) exceeds TRIG, don't
queue any more jobs until it goes below UNTRIG. This is currently only
supported on Linux and any other platforms that provide an uptime command
with load average output.

If both values are zero, load checks are disabled. The default is the number
of cpus * 1.5 for TRIG and half that for UNTRIG. If the number of cpus cannot
be determined, the default is 6,3.
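
For example, with 4 detected cpus the defaults are computed like this (a
sketch of the calculation described above; the variable names are only
illustrative):

    my $cpus = 4;
    my $trig   = $cpus * 1.5;    # 6 -- stop queueing jobs above this load
    my $untrig = $cpus * 0.75;   # 3 -- resume queueing jobs below this load
    my $load_triggers = sprintf("%g,%g", $trig, $untrig);  # "6,3"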

=item B<--restart-delay NUM>

After processing the queue, wait this many seconds until the queue is
restarted. The default is 300 seconds.

=item B<--lockfile FILE>

For perpetual operation, specify the full path to a lock file to create and
then remove after finishing/aborting. The default is /tmp/jobd-$suffix.lock
where $suffix is a 6-character string uniquely determined by the name and
nickname of this Girocco instance. The pid of the running jobd instance will
be written to the lock file.

=item B<--same-pid>

When performing a graceful restart, keep the same pid rather than switching to
a new one.

=item B<--all-once>

Instead of perpetually processing all projects over and over again, process
them just once and then exit.
Conflicts with B<--one PRJNAME> option.

=item B<--one PRJNAME>

Process only the given project (given as just the project name without C<.git>
suffix) and then exit. May be repeated to process more than one project.
Conflicts with B<--all-once> option.

=item B<--update-only>

Limit processing to only those projects that need a mirror update.
Behaves as though every project has a C<.nogc> file present in it.
Requires use of B<--all-once> or B<--one PRJNAME> option.
Conflicts with B<--gc-only> and B<--needs-gc-only> options.

=item B<--gc-only>

Limit processing to only those projects that need to have garbage collection
run on them. Behaves as though every project has a C<.bypass_fetch> file
present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
Conflicts with B<--update-only> and B<--needs-gc-only> options.

=item B<--needs-gc-only>

Limit processing to only those projects that need to have mini garbage
collection run on them. Behaves as though every project with a C<.needsgc>
file present in it also has a C<.bypass_fetch> file present in it and as though
every project without a C<.needsgc> file present in it has a C<.bypass> file
present in it. Requires use of B<--all-once> or B<--one PRJNAME> option.
Conflicts with B<--update-only> and B<--gc-only> options.
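
The per-project control files mentioned above are consulted roughly as in the
following sketch, based on the skip conditions in the update and gc job code
earlier in this file ($skip_update and $skip_gc are illustrative names only):

    # is this project skipped for a mirror update this round?
    my $skip_update = $gc_only || $needs_gc_only ||
        -e "$projpath/.nofetch" || -e "$projpath/.bypass" ||
        -e "$projpath/.bypass_fetch" || is_mirror_disabled($p);

    # is this project skipped for garbage collection this round?
    my $skip_gc = $update_only ||
        -e "$projpath/.nogc" || -e "$projpath/.bypass" ||
        (-e "$projpath/.delaygc" && ! -e "$projpath/.allowgc" &&
         ! -e "$projpath/.needsgc");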

=back

=head1 DESCRIPTION

jobd.pl is Girocco's repository maintenance servant; it periodically checks
all the repositories, updates mirrored repositories, and repacks push-mode
repositories when needed.