Increase system safety and ensure fans fail in a safe mode

This commit is contained in:
Carl Philipp Klemm 2023-05-04 14:02:17 +02:00
parent c084c42794
commit 77592844fb

View file

@ -71,6 +71,8 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
{ {
const char mi50Chip[] = "amdgpu-pci-2300"; const char mi50Chip[] = "amdgpu-pci-2300";
const char mi25Chip[] = "amdgpu-pci-4300"; const char mi25Chip[] = "amdgpu-pci-4300";
bool hitMi25 = false;
bool hitMi50 = false;
const char monitored_sensor_name[] = "edge"; const char monitored_sensor_name[] = "edge";
double max_temp = std::numeric_limits<double>::min(); double max_temp = std::numeric_limits<double>::min();
@ -78,11 +80,21 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
{ {
if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name) if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name)
{ {
if(sensor.chip == mi50Chip)
hitMi50 = true;
else
hitMi25 = true;
if(max_temp < sensor.reading) if(max_temp < sensor.reading)
max_temp = sensor.reading; max_temp = sensor.reading;
} }
} }
if(!hitMi50 || !hitMi25)
{
std::cerr<<"Could not get temperature from MI25 or MI50! Ramping fans to maximum\n";
return 1.0;
}
else
return fan_curve(max_temp, 0.20, 1.0, 45, 75); return fan_curve(max_temp, 0.20, 1.0, 45, 75);
} }
@ -90,20 +102,36 @@ double system_fan_zone(const std::vector<Sensor>& sensors)
{ {
Sensor cpu("IPMI", "CPU Temp"); Sensor cpu("IPMI", "CPU Temp");
Sensor system("IPMI", "System Temp"); Sensor system("IPMI", "System Temp");
bool hitCpu = false;
bool hitSystem = false;
std::vector<double> out; std::vector<double> out;
for(const Sensor& sensor : sensors) for(const Sensor& sensor : sensors)
{ {
if(cpu == sensor) if(cpu == sensor)
{
hitCpu = true;
cpu = sensor; cpu = sensor;
}
else if(sensor == system) else if(sensor == system)
{
hitSystem = true;
system = sensor; system = sensor;
} }
}
if(hitCpu && hitSystem)
{
double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65); double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65);
double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70); double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70);
return std::max(fanSystem, fanCpu); return std::max(fanSystem, fanCpu);
}
else
{
std::cerr<<"Could not get temperature from System or Cpu! Ramping fans to maximum\n";
return 1;
}
} }
std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors) std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
@ -114,20 +142,24 @@ std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
return out; return out;
} }
int main(int argc, char **argv) int main_loop()
{ {
signal(SIGABRT, sig_handler); ipmi_ctx_t raw_ctx = ipmi_open_context();
signal(SIGTERM, sig_handler); if(!raw_ctx)
signal(SIGHUP, sig_handler); {
signal(SIGINT, sig_handler); sensors_cleanup();
return 1;
if(argc > 1) }
quiet = true;
int ret = sensors_init(nullptr); int ret = sensors_init(nullptr);
if(ret < 0) if(ret < 0)
{ {
std::cerr<<"Could not init lm_sensors\n"; std::cerr<<"Could not init lm_sensors\n";
ipmi_set_fan_group(raw_ctx, 0, 1);
ipmi_set_fan_group(raw_ctx, 1, 1);
ipmi_ctx_close(raw_ctx);
ipmi_ctx_destroy(raw_ctx);
return 1;
} }
std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*"); std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
@ -135,13 +167,28 @@ int main(int argc, char **argv)
ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp")); ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
ipmi_sensors.push_back(Sensor("IPMI", "System Temp")); ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
if(lm_chips.size() < 2)
{
std::cerr<<"Could not get both monitored gpus!";
ipmi_set_fan_group(raw_ctx, 0, 1);
ipmi_set_fan_group(raw_ctx, 1, 1);
ipmi_ctx_close(raw_ctx);
ipmi_ctx_destroy(raw_ctx);
sensors_cleanup();
return 1;
}
ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring(); ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
if(!monitoring_ctx) if(!monitoring_ctx)
{
ipmi_set_fan_group(raw_ctx, 0, 1);
ipmi_set_fan_group(raw_ctx, 1, 1);
ipmi_ctx_close(raw_ctx);
ipmi_ctx_destroy(raw_ctx);
sensors_cleanup();
return 1; return 1;
}
ipmi_ctx_t raw_ctx = ipmi_open_context();
if(!raw_ctx)
return 1;
while(running) while(running)
{ {
@ -161,10 +208,39 @@ int main(int argc, char **argv)
sleep(10); sleep(10);
} }
ipmi_set_fan_group(raw_ctx, 0, 1);
ipmi_set_fan_group(raw_ctx, 1, 1);
ipmi_ctx_close(raw_ctx); ipmi_ctx_close(raw_ctx);
ipmi_ctx_destroy(raw_ctx); ipmi_ctx_destroy(raw_ctx);
ipmi_monitoring_ctx_destroy(monitoring_ctx); ipmi_monitoring_ctx_destroy(monitoring_ctx);
sensors_cleanup(); sensors_cleanup();
return 0; return 0;
} }
// Program entry point.
//
// Installs sig_handler for SIGABRT, SIGTERM, SIGHUP and SIGINT, sets the
// `quiet` flag when at least one command line argument is given, and then
// calls main_loop() up to three times in total. Retries stop as soon as the
// `running` flag is cleared.
//
// Returns the value of the last main_loop() call.
int main (int argc, char **argv)
{
	signal(SIGABRT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGHUP, sig_handler);
	signal(SIGINT, sig_handler);

	// Any extra argument switches on quiet mode; its value is not inspected.
	if(argc > 1)
		quiet = true;

	const size_t maxAttempts = 3;
	int ret = 0;
	for(size_t attempt = 0; attempt < maxAttempts; ++attempt)
	{
		ret = main_loop();
		if(!running)
			break;

		// Only announce and wait for a retry when another attempt actually
		// follows; otherwise we would delay giving up by a pointless 10 s.
		if(attempt + 1 < maxAttempts)
		{
			std::cerr<<"Mainloop unable to start, retrying in 10 sec\n";
			sleep(10);
		}
	}

	if(ret != 0)
		std::cerr<<"Error not clearing, giving up\n";
	return ret;
}