Increase system safety and ensure fans fail in a safe mode
This commit is contained in:
		
							parent
							
								
									c084c42794
								
							
						
					
					
						commit
						77592844fb
					
				
					 1 changed files with 91 additions and 15 deletions
				
			
		
							
								
								
									
										106
									
								
								main.cpp
									
										
									
									
									
								
							
							
						
						
									
										106
									
								
								main.cpp
									
										
									
									
									
								
							| 
						 | 
					@ -71,6 +71,8 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	const char mi50Chip[] = "amdgpu-pci-2300";
 | 
						const char mi50Chip[] = "amdgpu-pci-2300";
 | 
				
			||||||
	const char mi25Chip[] = "amdgpu-pci-4300";
 | 
						const char mi25Chip[] = "amdgpu-pci-4300";
 | 
				
			||||||
 | 
						bool hitMi25 = false;
 | 
				
			||||||
 | 
						bool hitMi50 = false;
 | 
				
			||||||
	const char monitored_sensor_name[] = "edge";
 | 
						const char monitored_sensor_name[] = "edge";
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	double max_temp = std::numeric_limits<double>::min();
 | 
						double max_temp = std::numeric_limits<double>::min();
 | 
				
			||||||
| 
						 | 
					@ -78,32 +80,58 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
		if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name)
 | 
							if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name)
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
 | 
								if(sensor.chip == mi50Chip)
 | 
				
			||||||
 | 
									hitMi50 = true;
 | 
				
			||||||
 | 
								else
 | 
				
			||||||
 | 
									hitMi25 = true;
 | 
				
			||||||
			if(max_temp < sensor.reading)
 | 
								if(max_temp < sensor.reading)
 | 
				
			||||||
				max_temp = sensor.reading;
 | 
									max_temp = sensor.reading;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return fan_curve(max_temp, 0.20, 1.0, 45, 75);
 | 
						if(!hitMi50 || !hitMi25)
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							std::cerr<<"Could not get temperature from MI25 or MI50! Ramping fans to maximum\n";
 | 
				
			||||||
 | 
							return 1.0;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
 | 
							return fan_curve(max_temp, 0.20, 1.0, 45, 75);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
double system_fan_zone(const std::vector<Sensor>& sensors)
 | 
					double system_fan_zone(const std::vector<Sensor>& sensors)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	Sensor cpu("IPMI", "CPU Temp");
 | 
						Sensor cpu("IPMI", "CPU Temp");
 | 
				
			||||||
	Sensor system("IPMI", "System Temp");
 | 
						Sensor system("IPMI", "System Temp");
 | 
				
			||||||
 | 
						bool hitCpu = false;
 | 
				
			||||||
 | 
						bool hitSystem = false;
 | 
				
			||||||
	std::vector<double> out;
 | 
						std::vector<double> out;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	for(const Sensor& sensor : sensors)
 | 
						for(const Sensor& sensor : sensors)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
		if(cpu == sensor)
 | 
							if(cpu == sensor)
 | 
				
			||||||
 | 
							{
 | 
				
			||||||
 | 
								hitCpu = true;
 | 
				
			||||||
			cpu = sensor;
 | 
								cpu = sensor;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
		else if(sensor == system)
 | 
							else if(sensor == system)
 | 
				
			||||||
 | 
							{
 | 
				
			||||||
 | 
								hitSystem = true;
 | 
				
			||||||
			system = sensor;
 | 
								system = sensor;
 | 
				
			||||||
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65);
 | 
						if(hitCpu && hitSystem)
 | 
				
			||||||
	double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70);
 | 
						{
 | 
				
			||||||
 | 
							double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65);
 | 
				
			||||||
 | 
							double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return std::max(fanSystem, fanCpu);
 | 
							return std::max(fanSystem, fanCpu);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						else
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							std::cerr<<"Could not get temperature from System or Cpu! Ramping fans to maximum\n";
 | 
				
			||||||
 | 
							return 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
 | 
					std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
 | 
				
			||||||
| 
						 | 
					@ -114,20 +142,24 @@ std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
 | 
				
			||||||
	return out;
 | 
						return out;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
int main(int argc, char **argv)
 | 
					int main_loop()
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	signal(SIGABRT, sig_handler);
 | 
						ipmi_ctx_t raw_ctx = ipmi_open_context();
 | 
				
			||||||
	signal(SIGTERM, sig_handler);
 | 
						if(!raw_ctx)
 | 
				
			||||||
	signal(SIGHUP, sig_handler);
 | 
						{
 | 
				
			||||||
	signal(SIGINT, sig_handler);
 | 
							sensors_cleanup();
 | 
				
			||||||
 | 
							return 1;
 | 
				
			||||||
	if(argc > 1)
 | 
						}
 | 
				
			||||||
		quiet = true;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	int ret = sensors_init(nullptr);
 | 
						int ret = sensors_init(nullptr);
 | 
				
			||||||
	if(ret < 0)
 | 
						if(ret < 0)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
		std::cerr<<"Could not init lm_sensors\n";
 | 
							std::cerr<<"Could not init lm_sensors\n";
 | 
				
			||||||
 | 
							ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
				
			||||||
 | 
							ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
				
			||||||
 | 
							ipmi_ctx_close(raw_ctx);
 | 
				
			||||||
 | 
							ipmi_ctx_destroy(raw_ctx);
 | 
				
			||||||
 | 
							return 1;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
 | 
						std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
 | 
				
			||||||
| 
						 | 
					@ -135,13 +167,28 @@ int main(int argc, char **argv)
 | 
				
			||||||
	ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
 | 
						ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
 | 
				
			||||||
	ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
 | 
						ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if(lm_chips.size() < 2)
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							std::cerr<<"Could not get both monitored gpus!";
 | 
				
			||||||
 | 
							ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
				
			||||||
 | 
							ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
				
			||||||
 | 
							ipmi_ctx_close(raw_ctx);
 | 
				
			||||||
 | 
							ipmi_ctx_destroy(raw_ctx);
 | 
				
			||||||
 | 
							sensors_cleanup();
 | 
				
			||||||
 | 
							return 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
 | 
						ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
 | 
				
			||||||
	if(!monitoring_ctx)
 | 
						if(!monitoring_ctx)
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
				
			||||||
 | 
							ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
				
			||||||
 | 
							ipmi_ctx_close(raw_ctx);
 | 
				
			||||||
 | 
							ipmi_ctx_destroy(raw_ctx);
 | 
				
			||||||
 | 
							sensors_cleanup();
 | 
				
			||||||
		return 1;
 | 
							return 1;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	ipmi_ctx_t raw_ctx = ipmi_open_context();
 | 
					 | 
				
			||||||
	if(!raw_ctx)
 | 
					 | 
				
			||||||
		return 1;
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	while(running)
 | 
						while(running)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
| 
						 | 
					@ -161,10 +208,39 @@ int main(int argc, char **argv)
 | 
				
			||||||
		sleep(10);
 | 
							sleep(10);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
				
			||||||
 | 
						ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
				
			||||||
	ipmi_ctx_close(raw_ctx);
 | 
						ipmi_ctx_close(raw_ctx);
 | 
				
			||||||
	ipmi_ctx_destroy(raw_ctx);
 | 
						ipmi_ctx_destroy(raw_ctx);
 | 
				
			||||||
	ipmi_monitoring_ctx_destroy(monitoring_ctx);
 | 
						ipmi_monitoring_ctx_destroy(monitoring_ctx);
 | 
				
			||||||
	sensors_cleanup();
 | 
						sensors_cleanup();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return 0;
 | 
						return 0;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					int main (int argc, char **argv)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						signal(SIGABRT, sig_handler);
 | 
				
			||||||
 | 
						signal(SIGTERM, sig_handler);
 | 
				
			||||||
 | 
						signal(SIGHUP, sig_handler);
 | 
				
			||||||
 | 
						signal(SIGINT, sig_handler);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if(argc > 1)
 | 
				
			||||||
 | 
							quiet = true;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						int ret = 0;
 | 
				
			||||||
 | 
						for(size_t i = 0; i < 3; ++i)
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							ret = main_loop();
 | 
				
			||||||
 | 
							if(!running)
 | 
				
			||||||
 | 
								break;
 | 
				
			||||||
 | 
							std::cerr<<"Mainloop unable to start, retrying in 10 sec\n";
 | 
				
			||||||
 | 
							sleep(10);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if(ret != 0)
 | 
				
			||||||
 | 
							std::cerr<<"Error not clearing, giveing up\n";
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return ret;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue