Increase system safety and ensure fans fail in a safe mode
This commit is contained in:
parent
c084c42794
commit
77592844fb
98
main.cpp
98
main.cpp
|
@ -71,6 +71,8 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
|
||||||
{
|
{
|
||||||
const char mi50Chip[] = "amdgpu-pci-2300";
|
const char mi50Chip[] = "amdgpu-pci-2300";
|
||||||
const char mi25Chip[] = "amdgpu-pci-4300";
|
const char mi25Chip[] = "amdgpu-pci-4300";
|
||||||
|
bool hitMi25 = false;
|
||||||
|
bool hitMi50 = false;
|
||||||
const char monitored_sensor_name[] = "edge";
|
const char monitored_sensor_name[] = "edge";
|
||||||
|
|
||||||
double max_temp = std::numeric_limits<double>::min();
|
double max_temp = std::numeric_limits<double>::min();
|
||||||
|
@ -78,11 +80,21 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
|
||||||
{
|
{
|
||||||
if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name)
|
if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name)
|
||||||
{
|
{
|
||||||
|
if(sensor.chip == mi50Chip)
|
||||||
|
hitMi50 = true;
|
||||||
|
else
|
||||||
|
hitMi25 = true;
|
||||||
if(max_temp < sensor.reading)
|
if(max_temp < sensor.reading)
|
||||||
max_temp = sensor.reading;
|
max_temp = sensor.reading;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(!hitMi50 || !hitMi25)
|
||||||
|
{
|
||||||
|
std::cerr<<"Could not get temperature from MI25 or MI50! Ramping fans to maximum\n";
|
||||||
|
return 1.0;
|
||||||
|
}
|
||||||
|
else
|
||||||
return fan_curve(max_temp, 0.20, 1.0, 45, 75);
|
return fan_curve(max_temp, 0.20, 1.0, 45, 75);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -90,20 +102,36 @@ double system_fan_zone(const std::vector<Sensor>& sensors)
|
||||||
{
|
{
|
||||||
Sensor cpu("IPMI", "CPU Temp");
|
Sensor cpu("IPMI", "CPU Temp");
|
||||||
Sensor system("IPMI", "System Temp");
|
Sensor system("IPMI", "System Temp");
|
||||||
|
bool hitCpu = false;
|
||||||
|
bool hitSystem = false;
|
||||||
std::vector<double> out;
|
std::vector<double> out;
|
||||||
|
|
||||||
for(const Sensor& sensor : sensors)
|
for(const Sensor& sensor : sensors)
|
||||||
{
|
{
|
||||||
if(cpu == sensor)
|
if(cpu == sensor)
|
||||||
|
{
|
||||||
|
hitCpu = true;
|
||||||
cpu = sensor;
|
cpu = sensor;
|
||||||
|
}
|
||||||
else if(sensor == system)
|
else if(sensor == system)
|
||||||
|
{
|
||||||
|
hitSystem = true;
|
||||||
system = sensor;
|
system = sensor;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(hitCpu && hitSystem)
|
||||||
|
{
|
||||||
double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65);
|
double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65);
|
||||||
double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70);
|
double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70);
|
||||||
|
|
||||||
return std::max(fanSystem, fanCpu);
|
return std::max(fanSystem, fanCpu);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cerr<<"Could not get temperature from System or Cpu! Ramping fans to maximum\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
|
std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
|
||||||
|
@ -114,20 +142,24 @@ std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main_loop()
|
||||||
{
|
{
|
||||||
signal(SIGABRT, sig_handler);
|
ipmi_ctx_t raw_ctx = ipmi_open_context();
|
||||||
signal(SIGTERM, sig_handler);
|
if(!raw_ctx)
|
||||||
signal(SIGHUP, sig_handler);
|
{
|
||||||
signal(SIGINT, sig_handler);
|
sensors_cleanup();
|
||||||
|
return 1;
|
||||||
if(argc > 1)
|
}
|
||||||
quiet = true;
|
|
||||||
|
|
||||||
int ret = sensors_init(nullptr);
|
int ret = sensors_init(nullptr);
|
||||||
if(ret < 0)
|
if(ret < 0)
|
||||||
{
|
{
|
||||||
std::cerr<<"Could not init lm_sensors\n";
|
std::cerr<<"Could not init lm_sensors\n";
|
||||||
|
ipmi_set_fan_group(raw_ctx, 0, 1);
|
||||||
|
ipmi_set_fan_group(raw_ctx, 1, 1);
|
||||||
|
ipmi_ctx_close(raw_ctx);
|
||||||
|
ipmi_ctx_destroy(raw_ctx);
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
|
std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
|
||||||
|
@ -135,13 +167,28 @@ int main(int argc, char **argv)
|
||||||
ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
|
ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
|
||||||
ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
|
ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
|
||||||
|
|
||||||
|
if(lm_chips.size() < 2)
|
||||||
|
{
|
||||||
|
std::cerr<<"Could not get both monitored gpus!";
|
||||||
|
ipmi_set_fan_group(raw_ctx, 0, 1);
|
||||||
|
ipmi_set_fan_group(raw_ctx, 1, 1);
|
||||||
|
ipmi_ctx_close(raw_ctx);
|
||||||
|
ipmi_ctx_destroy(raw_ctx);
|
||||||
|
sensors_cleanup();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
|
ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
|
||||||
if(!monitoring_ctx)
|
if(!monitoring_ctx)
|
||||||
|
{
|
||||||
|
ipmi_set_fan_group(raw_ctx, 0, 1);
|
||||||
|
ipmi_set_fan_group(raw_ctx, 1, 1);
|
||||||
|
ipmi_ctx_close(raw_ctx);
|
||||||
|
ipmi_ctx_destroy(raw_ctx);
|
||||||
|
sensors_cleanup();
|
||||||
return 1;
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
ipmi_ctx_t raw_ctx = ipmi_open_context();
|
|
||||||
if(!raw_ctx)
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
while(running)
|
while(running)
|
||||||
{
|
{
|
||||||
|
@ -161,10 +208,39 @@ int main(int argc, char **argv)
|
||||||
sleep(10);
|
sleep(10);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ipmi_set_fan_group(raw_ctx, 0, 1);
|
||||||
|
ipmi_set_fan_group(raw_ctx, 1, 1);
|
||||||
ipmi_ctx_close(raw_ctx);
|
ipmi_ctx_close(raw_ctx);
|
||||||
ipmi_ctx_destroy(raw_ctx);
|
ipmi_ctx_destroy(raw_ctx);
|
||||||
ipmi_monitoring_ctx_destroy(monitoring_ctx);
|
ipmi_monitoring_ctx_destroy(monitoring_ctx);
|
||||||
sensors_cleanup();
|
sensors_cleanup();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int main (int argc, char **argv)
|
||||||
|
{
|
||||||
|
signal(SIGABRT, sig_handler);
|
||||||
|
signal(SIGTERM, sig_handler);
|
||||||
|
signal(SIGHUP, sig_handler);
|
||||||
|
signal(SIGINT, sig_handler);
|
||||||
|
|
||||||
|
if(argc > 1)
|
||||||
|
quiet = true;
|
||||||
|
|
||||||
|
int ret = 0;
|
||||||
|
for(size_t i = 0; i < 3; ++i)
|
||||||
|
{
|
||||||
|
ret = main_loop();
|
||||||
|
if(!running)
|
||||||
|
break;
|
||||||
|
std::cerr<<"Mainloop unable to start, retrying in 10 sec\n";
|
||||||
|
sleep(10);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(ret != 0)
|
||||||
|
std::cerr<<"Error not clearing, giveing up\n";
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue