diff --git a/Makefile b/Makefile index 6732443..b82ac93 100644 --- a/Makefile +++ b/Makefile @@ -103,6 +103,7 @@ install-bin: $(INSTALL_DATA) lib/ntp.pm "$(DESTDIR)$(LIBDIR)/ntp.pm" $(INSTALL_DATA) lib/nut.pm "$(DESTDIR)$(LIBDIR)/nut.pm" $(INSTALL_DATA) lib/nvidia.pm "$(DESTDIR)$(LIBDIR)/nvidia.pm" + $(INSTALL_DATA) lib/nvidiagpu.pm "$(DESTDIR)$(LIBDIR)/nvidiagpu.pm" $(INSTALL_DATA) lib/nvme.pm "$(DESTDIR)$(LIBDIR)/nvme.pm" $(INSTALL_DATA) lib/pagespeed.pm "$(DESTDIR)$(LIBDIR)/pagespeed.pm" $(INSTALL_DATA) lib/pgsql.pm "$(DESTDIR)$(LIBDIR)/pgsql.pm" diff --git a/lib/nvidiagpu.pm b/lib/nvidiagpu.pm new file mode 100644 index 0000000..0bb1b73 --- /dev/null +++ b/lib/nvidiagpu.pm @@ -0,0 +1,717 @@ +# +# Monitorix - A lightweight system monitoring tool. +# +# Copyright (C) 2005-2021 by Jordi Sanfeliu +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# + +package nvidiagpu; + +use strict; +use warnings; +use Monitorix; +use RRDs; +use Cwd 'abs_path'; +use File::Basename; +use Exporter 'import'; +our @EXPORT = qw(nvidiagpu_init nvidiagpu_update nvidiagpu_cgi); + +my $max_number_of_gpus = 8; # Changing this number destroys history. +my $number_of_values_per_gpu_in_rrd = 14; # Changing this number destroys history. 
+ +sub nvidiagpu_init { + my $myself = (caller(0))[3]; + my ($package, $config, $debug) = @_; + my $rrd = $config->{base_lib} . $package . ".rrd"; + my $nvidiagpu = $config->{nvidiagpu}; + + my $info; + my @ds; + my @rra; + my @tmp; + my $n; + + my @average; + my @min; + my @max; + my @last; + + # checks if 'nvidia-smi' does exists. + if(!open(IN, "nvidia-smi |")) { + logger("$myself: unable to execute 'nvidia-smi'. $!"); + return; + } + close(IN); + + if(-e $rrd) { + my $rrd_n_gpu = 0; + my $rrd_n_gpu_times_n_values = 0; + $info = RRDs::info($rrd); + for my $key (keys %$info) { + if(index($key, 'ds[') == 0) { + if(index($key, '.type') != -1) { + push(@ds, substr($key, 3, index($key, ']') - 3)); + } + if(index($key, '_val0].index') != -1) { + $rrd_n_gpu += 1; + } + if(index($key, '.index') != -1) { + $rrd_n_gpu_times_n_values += 1; + } + } + if(index($key, 'rra[') == 0) { + if(index($key, '.rows') != -1) { + push(@rra, substr($key, 4, index($key, ']') - 4)); + } + } + } + if(scalar(@ds) / $rrd_n_gpu_times_n_values != keys(%{$nvidiagpu->{list}})) { + logger("$myself: Detected size mismatch between ... (" . keys(%{$nvidiagpu->{list}}) . ") and $rrd (" . scalar(@ds) / $rrd_n_gpu_times_n_values . "). Resizing it accordingly. All historical data will be lost. Backup file created."); + rename($rrd, "$rrd.bak"); + } + if($rrd_n_gpu < $max_number_of_gpus) { + logger("$myself: Detected size mismatch between max_number_of_gpus (" . $max_number_of_gpus . ") and $rrd (" . $rrd_n_gpu . "). Resizing it accordingly. All historical data will be lost. Backup file created."); + rename($rrd, "$rrd.bak"); + } + if($rrd_n_gpu_times_n_values / $rrd_n_gpu < $number_of_values_per_gpu_in_rrd) { + logger("$myself: Detected size mismatch between number_of_values_per_gpu_in_rrd (" . $number_of_values_per_gpu_in_rrd . ") and $rrd (" . ($rrd_n_gpu_times_n_values / $rrd_n_gpu) . "). Resizing it accordingly. All historical data will be lost. 
Backup file created."); + rename($rrd, "$rrd.bak"); + } + if(scalar(@rra) < 12 + (4 * $config->{max_historic_years})) { + logger("$myself: Detected size mismatch between 'max_historic_years' (" . $config->{max_historic_years} . ") and $rrd (" . ((scalar(@rra) -12) / 4) . "). Resizing it accordingly. All historical data will be lost. Backup file created."); + rename($rrd, "$rrd.bak"); + } + } + + if(!(-e $rrd)) { + logger("Creating '$rrd' file."); + for($n = 1; $n <= $config->{max_historic_years}; $n++) { + push(@average, "RRA:AVERAGE:0.5:1440:" . (365 * $n)); + push(@min, "RRA:MIN:0.5:1440:" . (365 * $n)); + push(@max, "RRA:MAX:0.5:1440:" . (365 * $n)); + push(@last, "RRA:LAST:0.5:1440:" . (365 * $n)); + } + for($n = 0; $n < keys(%{$nvidiagpu->{list}}); $n++) { + for(my $n_gpu = 0; $n_gpu < $max_number_of_gpus; $n_gpu++) { + for(my $n_sensor = 0; $n_sensor < $number_of_values_per_gpu_in_rrd; $n_sensor++) { + push(@tmp, "DS:nv" . $n . "_gpu" . $n_gpu . "_val" . $n_sensor . ":GAUGE:120:0:U"); + } + } + } + eval { + RRDs::create($rrd, + "--step=60", + @tmp, + "RRA:AVERAGE:0.5:1:1440", + "RRA:AVERAGE:0.5:30:336", + "RRA:AVERAGE:0.5:60:744", + @average, + "RRA:MIN:0.5:1:1440", + "RRA:MIN:0.5:30:336", + "RRA:MIN:0.5:60:744", + @min, + "RRA:MAX:0.5:1:1440", + "RRA:MAX:0.5:30:336", + "RRA:MAX:0.5:60:744", + @max, + "RRA:LAST:0.5:1:1440", + "RRA:LAST:0.5:30:336", + "RRA:LAST:0.5:60:744", + @last, + ); + }; + my $err = RRDs::error; + if($@ || $err) { + logger("$@") unless !$@; + if($err) { + logger("ERROR: while creating $rrd: $err"); + if($err eq "RRDs::error") { + logger("... is the RRDtool Perl package installed?"); + } + } + return; + } + } + + # check dependencies + if(lc($nvidiagpu->{alerts}->{coretemp_enabled} || "") eq "y") { + if(! 
-x $nvidiagpu->{alerts}->{coretemp_script}) { + logger("$myself: ERROR: script '$nvidiagpu->{alerts}->{coretemp_script}' doesn't exist or don't has execution permissions."); + } + } + if(lc($nvidiagpu->{alerts}->{memorytemp_enabled} || "") eq "y") { + if(! -x $nvidiagpu->{alerts}->{memorytemp_script}) { + logger("$myself: ERROR: script '$nvidiagpu->{alerts}->{memorytemp_script}' doesn't exist or don't has execution permissions."); + } + } + + $config->{nvidiagpu_hist_alert1} = (); + $config->{nvidiagpu_hist_alert2} = (); + push(@{$config->{func_update}}, $package); + logger("$myself: Ok") if $debug; +} + +sub nvidiagpu_update { + my $myself = (caller(0))[3]; + my ($package, $config, $debug) = @_; + my $rrd = $config->{base_lib} . $package . ".rrd"; + my $nvidiagpu = $config->{nvidiagpu}; + my $use_nan_for_missing_data = lc($nvidiagpu->{use_nan_for_missing_data} || "") eq "y" ? 1 : 0; + + my @sensors; + + my $n; + my $rrdata = "N"; + + foreach my $k (sort keys %{$nvidiagpu->{list}}) { + # values delimitted by ", " (comma + space) + my @gpu_group = split(', ', $nvidiagpu->{list}->{$k}); + for($n = 0; $n < $max_number_of_gpus; $n++) { + @sensors = ($use_nan_for_missing_data ? 
(0+"nan") : 0) x $number_of_values_per_gpu_in_rrd; + + if($gpu_group[$n]) { + my $str = trim($gpu_group[$n] || ""); + + open(IN, "nvidia-smi --format=csv,noheader,nounits -i $str --query-gpu=clocks.current.graphics,clocks.current.memory,utilization.gpu,utilization.memory,temperature.gpu,temperature.memory,fan.speed,pstate,power.draw,power.limit,memory.used,memory.total |"); + while() { + my @tmp = split(',', $_); + if(scalar(@tmp) > 1) { # To catch missing devices + for(my $n_sensor = 0; $n_sensor < scalar(@tmp); $n_sensor += 1) { + my $val = trim($tmp[$n_sensor]); + if($val ne "N/A") { + if(substr($val, 0, 1) eq "P") { + $val = substr($val, 1); + } + $val =~ tr/,//d; + $sensors[$n_sensor] = trim($val); + chomp($sensors[$n_sensor]); + } + } + $sensors[10] = $sensors[10] / $sensors[11] + } + } + close(IN); + } + + foreach(@sensors) { + $rrdata .= ":$_"; + } + + # nvidiagpu alert + if(lc($nvidiagpu->{alerts}->{coretemp_enabled}) eq "y") { + my $sensorIndex = 1; + $config->{nvidiagpu_hist_alert1}->{$n} = 0 if(!$config->{nvidiagpu_hist_alert1}->{$n}); + if($sensors[$sensorIndex] >= $nvidiagpu->{alerts}->{coretemp_threshold} && $config->{nvidiagpu_hist_alert1}->{$n} < $sensors[$sensorIndex]) { + if(-x $nvidiagpu->{alerts}->{coretemp_script}) { + logger("$myself: ALERT: executing script '$nvidiagpu->{alerts}->{coretemp_script}'."); + system($nvidiagpu->{alerts}->{coretemp_script} . " " .$nvidiagpu->{alerts}->{coretemp_timeintvl} . " " . $nvidiagpu->{alerts}->{coretemp_threshold} . " " . 
$sensors[$sensorIndex]); + } else { + logger("$myself: ERROR: script '$nvidiagpu->{alerts}->{coretemp_script}' doesn't exist or don't has execution permissions."); + } + $config->{nvidiagpu_hist_alert1}->{$n} = $sensors[$sensorIndex]; + } + } + if(lc($nvidiagpu->{alerts}->{memorytemp_enabled}) eq "y") { + my $sensorIndex = 2; + $config->{nvidiagpu_hist_alert2}->{$n} = 0 if(!$config->{nvidiagpu_hist_alert2}->{$n}); + if($sensors[$sensorIndex] >= $nvidiagpu->{alerts}->{memorytemp_threshold} && $config->{nvidiagpu_hist_alert2}->{$n} < $sensors[$sensorIndex]) { + if(-x $nvidiagpu->{alerts}->{memorytemp_script}) { + logger("$myself: ALERT: executing script '$nvidiagpu->{alerts}->{memorytemp_script}'."); + system($nvidiagpu->{alerts}->{memorytemp_script} . " " .$nvidiagpu->{alerts}->{memorytemp_timeintvl} . " " . $nvidiagpu->{alerts}->{memorytemp_threshold} . " " . $sensors[$sensorIndex]); + } else { + logger("$myself: ERROR: script '$nvidiagpu->{alerts}->{memorytemp_script}' doesn't exist or don't has execution permissions."); + } + $config->{nvidiagpu_hist_alert2}->{$n} = $sensors[$sensorIndex]; + } + } + } + } + + RRDs::update($rrd, $rrdata); + logger("$myself: $rrdata") if $debug; + my $err = RRDs::error; + logger("ERROR: while updating $rrd: $err") if $err; +} + +sub nvidiagpu_cgi { + my ($package, $config, $cgi) = @_; + my @output; + + my $nvidiagpu = $config->{nvidiagpu}; + my @rigid = split(',', ($nvidiagpu->{rigid} || "")); + my @limit = split(',', ($nvidiagpu->{limit} || "")); + my $tf = $cgi->{tf}; + my $colors = $cgi->{colors}; + my $graph = $cgi->{graph}; + my $silent = $cgi->{silent}; + my $zoom = "--zoom=" . 
$config->{global_zoom}; + my %rrd = ( + 'new' => \&RRDs::graphv, + 'old' => \&RRDs::graph, + ); + my $version = "new"; + my $pic; + my $picz; + my $picz_width; + my $picz_height; + + my $u = ""; + my $width; + my $height; + my @extra; + my @riglim; + my @IMG; + my @IMGz; + my @tmp; + my @tmpz; + my @CDEF; + my $n; + my $n2; + my $e; + my $e2; + my $str; + my $err; + my @LC = ( + "#FFA500", + "#44EEEE", + "#44EE44", + "#4444EE", + "#448844", + "#EE4444", + "#EE44EE", + "#EEEE44", + ); + + my $number_of_sensor_values_in_use = 11; + if($number_of_sensor_values_in_use > $number_of_values_per_gpu_in_rrd) { + logger(@output, "ERROR: Number of sensor values (" . $number_of_sensor_values_in_use . ") has smaller or equal to number of sensor values in rrd (" . $number_of_values_per_gpu_in_rrd . ")!"); + return; + } + my $show_current_values = lc($nvidiagpu->{show_current_values} || "") eq "y" ? 1 : 0; + + $version = "old" if $RRDs::VERSION < 1.3; + my $rrd = $config->{base_lib} . $package . ".rrd"; + my $title = $config->{graph_title}->{$package}; + my $IMG_DIR = $config->{base_dir} . "/" . $config->{imgs_dir}; + my $imgfmt_uc = uc($config->{image_format}); + my $imgfmt_lc = lc($config->{image_format}); + foreach my $i (split(',', $config->{rrdtool_extra_options} || "")) { + push(@extra, trim($i)) if trim($i); + } + + $title = !$silent ? $title : ""; + my $gap_on_all_nan = lc($nvidiagpu->{gap_on_all_nan} || "") eq "y" ? 1 : 0; + + # text mode + # + if(lc($config->{iface_mode}) eq "text") { + if($title) { + push(@output, main::graph_header($title, 2)); + push(@output, " \n"); + push(@output, " \n"); + } + my (undef, undef, undef, $data) = RRDs::fetch("$rrd", + "--resolution=$tf->{res}", + "--start=-$tf->{nwhen}$tf->{twhen}", + "AVERAGE"); + $err = RRDs::error; + push(@output, "ERROR: while fetching $rrd: $err\n") if $err; + my $line1; + my $line2; + my $line3; + push(@output, "
\n");
+		foreach my $k (sort keys %{$nvidiagpu->{list}}) {
+			# values delimitted by ", " (comma + space)
+			my @d = split(', ', $nvidiagpu->{list}->{$k});
+			for($n = 0; $n < scalar(@d); $n++) {
+				$str = sprintf(" NVIDIAgpu %d               ", $n + 1);
+				$line1 .= $str;
+				$str = sprintf(" Sensor values ");
+				$line2 .= $str;
+				$line3 .=      "----------------------";
+			}
+		}
+		push(@output, "     $line1\n");
+		push(@output, "Time $line2\n");
+		push(@output, "-----$line3\n");
+		my $line;
+		my @row;
+		my $time;
+		my $from;
+		my $to;
+		for($n = 0, $time = $tf->{tb}; $n < ($tf->{tb} * $tf->{ts}); $n++) {
+			$line = @$data[$n];
+			$time = $time - (1 / $tf->{ts});
+			push(@output, sprintf(" %2d$tf->{tc} ", $time));
+			$e = 0;
+			foreach my $k (sort keys %{$nvidiagpu->{list}}) {
+				# values delimitted by ", " (comma + space)
+				my @d = split(', ', $nvidiagpu->{list}->{$k});
+				for($n2 = 0; $n2 < scalar(@d); $n2++) {
+					$from = ($e * $max_number_of_gpus * $number_of_values_per_gpu_in_rrd) + ($n2 * $number_of_values_per_gpu_in_rrd);
+					$to = $from + 3;
+					my @sensor_values = @$line[$from..$to];
+					@row = (celsius_to($config, $sensor_values[0]), @sensor_values[1, -1]);
+					my $format_string = "%7.0f" x scalar(@row);
+					push(@output, sprintf(" " . $format_string. " ", @row));
+				}
+				$e++;
+			}
+			push(@output, "\n");
+		}
+		push(@output, "    
\n"); + if($title) { + push(@output, " \n"); + push(@output, " \n"); + push(@output, main::graph_footer()); + } + push(@output, "
\n"); + return @output; + } + + + # graph mode + # + if($silent eq "yes" || $silent eq "imagetag") { + $colors->{fg_color} = "#000000"; # visible color for text mode + $u = "_"; + } + if($silent eq "imagetagbig") { + $colors->{fg_color} = "#000000"; # visible color for text mode + $u = ""; + } + + for($n = 0; $n < keys(%{$nvidiagpu->{list}}); $n++) { + for($n2 = 0; $n2 < $number_of_sensor_values_in_use; $n2++) { + $str = $u . $package . $n . $n2 . "." . $tf->{when} . ".$imgfmt_lc"; + push(@IMG, $str); + unlink("$IMG_DIR" . $str); + if(lc($config->{enable_zoom}) eq "y") { + $str = $u . $package . $n . $n2 . "z." . $tf->{when} . ".$imgfmt_lc"; + push(@IMGz, $str); + unlink("$IMG_DIR" . $str); + } + } + } + + # Plot settings in order of the sensor array. + + my $temperature_unit = lc($config->{temperature_scale}) eq "f" ? "Fahrenheit" : "Celsius"; + my $temperature_scaling = lc($config->{temperature_scale}) eq "f" ? ",9,*,5,/,32,+" : ""; + + my @y_axis_titles_per_plot = ( + "Percent (%)", + $temperature_unit, + $temperature_unit, + "Percent (%)",, + "Watt", + "Percent (%)", + "Percent (%)", + "Hz", + "Hz", + "P" + ); + my @value_transformations_per_sensor = ( + ",1000000,*", + ",1000000,*", + "", + "", + $temperature_scaling, + $temperature_scaling, + "", + "", + "", + "", + ",100,*" + ); + my @legend_labels_per_sensor = ( + "%4.2lf%s", + "%4.2lf%s", + "%3.0lf%%", + "%3.0lf%%", + "%3.1lf", + "%3.1lf", + "%3.1lf%%", + "%1.0lf", + "%5.0lf%s", + "%5.0lf%s", + "%3.1lf%%" + ); + + my @graphs_per_plot = (6, 4, 5, 10, [8, 9], 2, 3, 0, 1, 7); # To rearange the graphs + my $main_sensor_plots = 4; # Number of sensor plots on the left side. + my @main_plots_with_average = (1, 1, 1, 1); # Wether or not the main plots show average, min and max or only the last value in the legend. + + my $number_of_plots = scalar(@graphs_per_plot); + + if(scalar(@y_axis_titles_per_plot) != $number_of_plots) { + push(@output, "ERROR: Size of y_axis_titles_per_plot (" . 
scalar(@y_axis_titles_per_plot) . ") has to be equal to number_of_plots (" . $number_of_plots . ")"); + } + if(scalar(@value_transformations_per_sensor) != $number_of_sensor_values_in_use) { + push(@output, "ERROR: Size of value_transformations_per_sensor (" . scalar(@value_transformations_per_sensor) . ") has to be equal to number_of_sensor_values_in_use (" . $number_of_sensor_values_in_use . ")"); + } + if(scalar(@legend_labels_per_sensor) != $number_of_sensor_values_in_use) { + push(@output, "ERROR: Size of legend_labels_per_sensor (" . scalar(@legend_labels_per_sensor) . ") has to be equal to number_of_sensor_values_in_use (" . $number_of_sensor_values_in_use . ")"); + } + if(scalar(@graphs_per_plot) >= $number_of_sensor_values_in_use) { + push(@output, "ERROR: Size of graphs_per_plot (" . scalar(@graphs_per_plot) . ") has to be smaller than number_of_sensor_values_in_use (" . $number_of_sensor_values_in_use . ")"); + } + if(scalar(@main_plots_with_average) != $main_sensor_plots) { + push(@output, "ERROR: Size of main_plots_with_average (" . scalar(@main_plots_with_average) . ") has to be equal to main_sensor_plots (" . $main_sensor_plots . ")"); + } + + $e = 0; + foreach my $k (sort keys %{$nvidiagpu->{list}}) { + # values delimitted by ", " (comma + space) + my @d = split(', ', $nvidiagpu->{list}->{$k}); + if($e) { + push(@output, "
\n"); + } + if($title) { + push(@output, main::graph_header($title, 2)); + push(@output, " \n"); + push(@output, " \n"); + } + for(my $n_graph = 0, my $n_plot = 0; $n_graph < $number_of_sensor_values_in_use; $n_graph += 1, $n_plot += 1) { + if($title && $n_plot == $main_sensor_plots) { + push(@output, " \n"); + push(@output, " \n"); + } + + if($n_graph > scalar(@graphs_per_plot)) { + push(@output, "ERROR: n_graph (" . $n_graph . ") has to smaller than size of graphs_per_plot (" . scalar(@graphs_per_plot) . ")"); + } + my $n_sensor; + my $n_sensor2; + if (ref($graphs_per_plot[$n_graph]) eq 'ARRAY') { + $n_sensor = $graphs_per_plot[$n_plot]->[0]; + $n_sensor2 = $graphs_per_plot[$n_plot]->[1]; + $n_graph += 1 + } else { + $n_sensor = $graphs_per_plot[$n_plot]; + } + + @riglim = @{setup_riglim($rigid[$n_plot], $limit[$n_plot])}; + undef(@tmp); + undef(@tmpz); + undef(@CDEF); + if($n_plot < $main_sensor_plots) { + push(@tmp, "COMMENT: \\n"); + } + for($n = 0; $n < $max_number_of_gpus; $n += 1) { + if($d[$n]) { + my $dstr = trim($d[$n]); + my $base = ""; + $dstr =~ s/^\"//; + $dstr =~ s/\"$//; + + # $dstr =~ s/^(.+?) .*$/$1/; + if($base && defined($nvidiagpu->{map}->{$base})) { + $dstr = $nvidiagpu->{map}->{$base}; + } else { + if(defined($nvidiagpu->{map}->{$dstr})) { + $dstr = $nvidiagpu->{map}->{$dstr}; + } + } + if($n_plot < $main_sensor_plots) { + if($main_plots_with_average[$n_plot]) { + $str = sprintf("%-20s", $dstr); + } else { + $str = sprintf("%-57s", $dstr); + } + } else { + if($show_current_values) { + $str = sprintf("%-13s", substr($dstr, 0, 13)); + } else { + $str = sprintf("%-19s", substr($dstr, 0, 19)); + } + } + + my $value_name = "gpu" . $n . "_val" . $n_sensor; + my $value_name2; + push(@tmp, "LINE2:trans_" . $value_name . $LC[$n] . ":$str" . ($n_plot < $main_sensor_plots ? "" : ( $show_current_values ? "\\: \\g" : (($n%2 || !$d[$n+1]) ? "\\n" : "")))); + push(@tmpz, "LINE2:trans_" . $value_name . $LC[$n] . 
":$dstr"); + + if ($n_sensor2) { + $value_name2 = "gpu" . $n . "_val" . $n_sensor2; + push(@tmp, "LINE2:trans_" . $value_name2 . $LC[$n] . "BB" . ":dashes=1,3:"); + push(@tmpz, "LINE2:trans_" . $value_name2 . $LC[$n] . "BB" . ":dashes=1,3:"); + } + + if($n_plot < $main_sensor_plots) { + if($main_plots_with_average[$n_plot]) { + push(@tmp, "GPRINT:trans_" . $value_name . ":LAST: Current\\: " . $legend_labels_per_sensor[$n_sensor]); + push(@tmp, "GPRINT:trans_" . $value_name . ":AVERAGE: Average\\: " . $legend_labels_per_sensor[$n_sensor]); + push(@tmp, "GPRINT:trans_" . $value_name . ":MIN: Min\\: " . $legend_labels_per_sensor[$n_sensor]); + push(@tmp, "GPRINT:trans_" . $value_name . ":MAX: Max\\: " . $legend_labels_per_sensor[$n_sensor] . "\\n"); + } else { + push(@tmp, "GPRINT:trans_" . $value_name . ":LAST: Current\\: " . $legend_labels_per_sensor[$n_sensor] . "\\n"); + } + } else { + if($show_current_values) { + if($n_sensor2 && $value_name2) { + push(@tmp, "GPRINT:trans_" . $value_name . ":LAST:" . $legend_labels_per_sensor[$n_sensor] . "\\g"); + push(@tmp, "GPRINT:trans_" . $value_name2 . ":LAST: /" . $legend_labels_per_sensor[$n_sensor2] . " (actual/limit)\\n"); + } else { + push(@tmp, "GPRINT:trans_" . $value_name . ":LAST:" . $legend_labels_per_sensor[$n_sensor] . (($n%2 || !$d[$n+1]) ? "\\n" : "")); + } + } + } + } + } + + if($n_plot < $main_sensor_plots) { + push(@tmp, "COMMENT: \\n"); + if(scalar(@d) && (scalar(@d) % 2)) { + push(@tmp, "COMMENT: \\n"); + } + } + + for(my $n_gpu = 0; $n_gpu < $max_number_of_gpus; $n_gpu++) { + my $value_name = "gpu" . $n_gpu . "_val" . $n_sensor; + push(@CDEF, "CDEF:trans_" . $value_name . "=" . $value_name . $value_transformations_per_sensor[$n_sensor]); + if ($n_sensor2) { + my $value_name2 = "gpu" . $n_gpu . "_val" . $n_sensor2; + push(@CDEF, "CDEF:trans_" . $value_name2 . "=" . $value_name2 . 
$value_transformations_per_sensor[$n_sensor2]); + } + } + if(lc($config->{show_gaps}) eq "y") { + push(@tmp, "AREA:wrongdata#$colors->{gap}:"); + push(@tmpz, "AREA:wrongdata#$colors->{gap}:"); + push(@CDEF, "CDEF:wrongdata=allvalues,UN,INF,UNKN,IF"); + } + ($width, $height) = split('x', $config->{graph_size}->{($n_plot < $main_sensor_plots) ? 'main' : 'small'}); + if($silent =~ /imagetag/) { + ($width, $height) = split('x', $config->{graph_size}->{remote}) if $silent eq "imagetag"; + ($width, $height) = split('x', $config->{graph_size}->{main}) if $silent eq "imagetagbig"; + @tmp = @tmpz; + push(@tmp, "COMMENT: \\n"); + push(@tmp, "COMMENT: \\n"); + push(@tmp, "COMMENT: \\n"); + } + if ($n_plot >= $main_sensor_plots) { + $height *= 1.6 + } + + my @def_sensor_average; + my $cdef_sensor_allvalues = "CDEF:allvalues="; + for(my $n_gpu = 0; $n_gpu < $max_number_of_gpus; $n_gpu++) { + my $value_name = "gpu" . $n_gpu . "_val" . $n_sensor; + push(@def_sensor_average, "DEF:" . $value_name . "=$rrd:nv" . $e . "_" . $value_name . ":AVERAGE"); + if($n_sensor2) { + my $value_name2 = "gpu" . $n_gpu . "_val" . $n_sensor2; + push(@def_sensor_average, "DEF:" . $value_name2 . "=$rrd:nv" . $e . "_" . $value_name2 . ":AVERAGE"); + } + + if($n_gpu != 0) { + $cdef_sensor_allvalues .= ","; + } + if ($gap_on_all_nan) { + $cdef_sensor_allvalues .= $value_name . ",UN,0,1,IF"; + } else { + $cdef_sensor_allvalues .= $value_name; + } + } + $cdef_sensor_allvalues .= ",+" x ($max_number_of_gpus - 1); + if ($gap_on_all_nan) { + $cdef_sensor_allvalues .= ",0,GT,1,UNKN,IF"; + } + my $plot_title = $config->{graphs}->{'_nvidiagpu' . ($n_plot + 1)}; + $pic = $rrd{$version}->("$IMG_DIR" . "$IMG[$e * 3 + $n_plot]", + "--title=$plot_title ($tf->{nwhen}$tf->{twhen})", + "--start=-$tf->{nwhen}$tf->{twhen}", + "--imgformat=$imgfmt_uc", + "--vertical-label=" . 
$y_axis_titles_per_plot[$n_plot], + "--width=$width", + "--height=$height", + @extra, + @riglim, + $zoom, + @{$cgi->{version12}}, + $n_plot < $main_sensor_plots ? () : @{$cgi->{version12_small}}, + @{$colors->{graph_colors}}, + @def_sensor_average, + $cdef_sensor_allvalues, + @CDEF, + @tmp); + $err = RRDs::error; + push(@output, "ERROR: while graphing $IMG_DIR" . "$IMG[$e * 3 + $n_plot]: $err\n") if $err; + if(lc($config->{enable_zoom}) eq "y") { + ($width, $height) = split('x', $config->{graph_size}->{zoom}); + $picz = $rrd{$version}->("$IMG_DIR" . "$IMGz[$e * 3 + $n_plot]", + "--title=$plot_title ($tf->{nwhen}$tf->{twhen})", + "--start=-$tf->{nwhen}$tf->{twhen}", + "--imgformat=$imgfmt_uc", + "--vertical-label=" . $y_axis_titles_per_plot[$n_plot], + "--width=$width", + "--height=$height", + @extra, + @riglim, + $zoom, + @{$cgi->{version12}}, + $n_plot < $main_sensor_plots ? () : @{$cgi->{version12_small}}, + @{$colors->{graph_colors}}, + @def_sensor_average, + $cdef_sensor_allvalues, + @CDEF, + @tmpz); + $err = RRDs::error; + push(@output, "ERROR: while graphing $IMG_DIR" . "$IMGz[$e * 3 + $n_plot]: $err\n") if $err; + } + $e2 = $e + $n_plot + 1; + if($title || ($silent =~ /imagetag/ && $graph =~ /nvidiagpu$e2/)) { + if(lc($config->{enable_zoom}) eq "y") { + if(lc($config->{disable_javascript_void}) eq "y") { + push(@output, " {url} . "/" . $config->{imgs_dir} . $IMGz[$e * 3 + $n_plot] . "\">\n"); + } else { + if($version eq "new") { + $picz_width = $picz->{image_width} * $config->{global_zoom}; + $picz_height = $picz->{image_height} * $config->{global_zoom}; + } else { + $picz_width = $width + 115; + $picz_height = $height + 100; + } + push(@output, " {url} . "/" . $config->{imgs_dir} . $IMGz[$e * 3 + $n_plot] . "','','width=" . $picz_width . ",height=" . $picz_height . 
",scrollbars=0,resizable=0'))\">\n"); + } + } else { + push(@output, " \n"); + } + } + } + + if($title) { + push(@output, " \n"); + push(@output, " \n"); + + if($nvidiagpu->{desc}->{$k}) { + push(@output, " \n"); + push(@output, " \n"); + push(@output, " \n"); + push(@output, " \n"); + push(@output, "   $nvidiagpu->{desc}->{$k}\n"); + push(@output, " \n"); + push(@output, " \n"); + push(@output, " \n"); + } + push(@output, main::graph_footer()); + } + $e++; + } + push(@output, "
\n"); + return @output; +} + +1; diff --git a/man/man5/monitorix.conf.5 b/man/man5/monitorix.conf.5 index dc66726..8ae0466 100644 --- a/man/man5/monitorix.conf.5 +++ b/man/man5/monitorix.conf.5 @@ -1542,6 +1542,165 @@ This option, when enabled via \fIy\fP, combined with the \fIshow_gaps\fP option .P Default value: \fIn\fP .RE +.SS NVIDIA GPU temperatures and usage (nvidiagpu.pm) +This graph is able to monitor an unlimited number of Nvidia GPUs via \fInvidia-smi\fP. +.P +.BI list +.RS +This is a list of groups of GPUs that you want to monitor. Each group will become a graph and there may be an unlimited number of groups. +.P +WARNING: Every time the number of groups in this option changes, Monitorix will resize the \fInvidiagpu.rrd\fP file accordingly, removing all historical data. +.P +As identifier you can either use the GPU ID or the GPU UUID found via \fInvidia-smi --list-gpus\fP. You can add it to the group 0 like this: +.P +.RS + +.br + 0 = 0, GPU-531b3e21-2fa4-1254-1215-2361f2d345ef +.br + 1 = 2 +.br + +.RE +.P +The maximum number of GPUs allowed per group is 8. +.RE +.P +.BI map +.RS +This list complements the \fBlist\fP option. It basically allows you to change the GPU name that will appear in the graph, hiding the real device name. If no association is defined, then Monitorix will display the name of the GPU device as it is. +.P +.RS + +.br + 0 = RTX 3090 +.br + GPU-531b3e21-2fa4-1254-1215-2361f2d345ef = RTX 3080 +.br + 2 = RTX 3080 Ti +.br + +.RE +.RE +.P +.BI desc +.RS +This list complements the \fBlist\fP option. It basically allows you to include a title for every group of GPUs. The title will appear in the title of the graph. +.P +.RS + +.br + 0 = Host +.br + 1 = Virtual +.br + +.RE +.RE +.P +.BI coretemp_enabled +.RS +This section enables or disables one of the alert capabilities for this graph; the alert for the core temperature. 
It works as follows: +.P +If the core temperature of any of the specified GPU device names reaches or exceeds the \fBcoretemp_threshold\fP (the interval of time is not used here), Monitorix will execute the external alert script defined in \fBcoretemp_script\fP. +.P +The default Monitorix installation includes an example of a shell-script alert called \fBmonitorix-alert.sh\fP which you can use as a base for your own script. +.P +Default value: \fIn\fP +.RE +.P +.BI coretemp_timeintvl +.RS +Not used in this alert. +.P +Default value: \fI0\fP +.RE +.P +.BI coretemp_threshold +.RS +This is the value that needs to be reached or exceeded to trigger the mechanism for a particular action, which in this case is the execution of an external alert script. +.P +Default value: \fI10\fP +.RE +.P +.BI coretemp_script +.RS +This is the full path name of the script that will be executed by this alert. +.P +It will receive the following three parameters: +.P +1st - the value currently defined in \fBcoretemp_timeintvl\fP. +.br +2nd - the value currently defined in \fBcoretemp_threshold\fP. +.br +3rd - the current core temperature. +.P +Default value: \fI/path/to/script.sh\fP +.RE +.P +.BI memorytemp_enabled +.RS +This section enables or disables one of the alert capabilities for this graph; the alert for the memory temperature. It works as follows: +.P +If the memory temperature of any of the specified GPU names reaches or exceeds the \fBmemorytemp_threshold\fP (the interval of time is not used here), Monitorix will execute the external alert script defined in \fBmemorytemp_script\fP. +.P +The default Monitorix installation includes an example of a shell-script alert called \fBmonitorix-alert.sh\fP which you can use as a base for your own script. +.P +Default value: \fIn\fP +.RE +.P +.BI memorytemp_timeintvl +.RS +Not used in this alert. 
+.P +Default value: \fI0\fP +.RE +.P +.BI memorytemp_threshold +.RS +This is the value that needs to be reached or exceeded to trigger the mechanism for a particular action, which in this case is the execution of an external alert script. +.P +Default value: \fI90\fP +.RE +.P +.BI memorytemp_script +.RS +This is the full path name of the script that will be executed by this alert. +.P +It will receive the following three parameters: +.P +1st - the value currently defined in \fBmemorytemp_timeintvl\fP. +.br +2nd - the value currently defined in \fBmemorytemp_threshold\fP. +.br +3rd - the current memory temperature. +.P +Default value: \fI/path/to/script.sh\fP +.RE +.P +.BI show_current_values +.RS +.P +Print current values in the legend of the small right-hand plots. +.P +Default value: \fIn\fP +.RE +.P +.BI use_nan_for_missing_data +.RS +This option, when enabled via \fIy\fP, shows \fInan\fP values for missing data instead of \fI0\fP. This is useful when \fI0\fP could be mistaken for valid data. +.P +Default value: \fIn\fP +.RE +.P +.RE +.RE +.BI gap_on_all_nan +.RS +This option, when enabled via \fIy\fP, combined with the \fIshow_gaps\fP option shows gaps only if all data points are \fInan\fP instead of requiring only one to be \fInan\fP for a gap. This can be useful if not all sensor data are required for normal operation. +.P +Default value: \fIn\fP +.RE .SS NVIDIA temperatures and usage (nvidia.pm) This graph requires to have installed the official NVIDIA drivers. 
.P diff --git a/monitorix.conf b/monitorix.conf index 9a9fb3f..dd4fe3a 100644 --- a/monitorix.conf +++ b/monitorix.conf @@ -85,6 +85,7 @@ secure_log_date_format = %b %e ipmi = n ambsens = n amdgpu = n + nvidiagpu = n nvidia = n disk = n nvme = n @@ -331,6 +332,33 @@ secure_log_date_format = %b %e +# NVIDIAgpu graph +# ----------------------------------------------------------------------------- + + + 0 = 0, 1 + + rigid = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + limit = 100, 100, 100, 100, 100, 100, 100, 100, 100, 100 + use_nan_for_missing_data = y + gap_on_all_nan = y + show_current_values = y + + 0 = RTX 3090 + 1 = RTX 3080 + + + coretemp_enabled = n + coretemp_timeintvl = 0 + coretemp_threshold = 1 + coretemp_script = /path/to/script.sh + memorytemp_enabled = n + memorytemp_timeintvl = 0 + memorytemp_threshold = 1 + memorytemp_script = /path/to/script.sh + + + # NVIDIA graph # ----------------------------------------------------------------------------- @@ -1041,7 +1069,7 @@ logo_bottom = logo_bot.png remote = 300x100 -graph_name = system, kern, proc, hptemp, lmsens, gensens, ipmi, ambsens, amdgpu, nvidia, disk, nvme, fs, zfs, du, net, netstat, tinyproxy, tc, libvirt, process, serv, mail, port, user, ftp, apache, nginx, lighttpd, mysql, pgsql, mongodb, varnish, pagespeed, squid, nfss, nfsc, bind, unbound, ntp, chrony, fail2ban, icecast, raspberrypi, phpapc, memcached, redis, phpfpm, apcupsd, nut, wowza, int, verlihub +graph_name = system, kern, proc, hptemp, lmsens, gensens, ipmi, ambsens, amdgpu, nvidiagpu, nvidia, disk, nvme, fs, zfs, du, net, netstat, tinyproxy, tc, libvirt, process, serv, mail, port, user, ftp, apache, nginx, lighttpd, mysql, pgsql, mongodb, varnish, pagespeed, squid, nfss, nfsc, bind, unbound, ntp, chrony, fail2ban, icecast, raspberrypi, phpapc, memcached, redis, phpfpm, apcupsd, nut, wowza, int, verlihub system = System load average and usage @@ -1053,6 +1081,7 @@ graph_name = system, kern, proc, hptemp, lmsens, gensens, ipmi, ambsens, amdgpu, ipmi = 
IPMI sensor statistics ambsens = Ambient sensor statistics amdgpu = AMD GPU temperatures and usage + nvidiagpu = NVIDIA GPU temperatures and usage nvidia = NVIDIA temperatures and usage disk = Disk drive temperatures and health nvme = NVMe drive temperatures and health @@ -1130,6 +1159,16 @@ graph_name = system, kern, proc, hptemp, lmsens, gensens, ipmi, ambsens, amdgpu, _amdgpu8 = Core clock _amdgpu9 = Memory clock _amdgpu10 = Memory usage + _nvidiagpu1 = Fan speed + _nvidiagpu2 = Core temperature + _nvidiagpu3 = Memory temperature + _nvidiagpu4 = Memory usage + _nvidiagpu5 = Power + _nvidiagpu6 = Core util. + _nvidiagpu7 = Memory util. + _nvidiagpu8 = Core clock + _nvidiagpu9 = Memory clock + _nvidiagpu10 = Performance state _nvidia1 = NVIDIA temperatures _nvidia2 = CPU usage _nvidia3 = Memory usage