Increase system safety and ensure fans fail in a safe mode
This commit is contained in:
		
							parent
							
								
									c084c42794
								
							
						
					
					
						commit
						77592844fb
					
				
					 1 changed files with 91 additions and 15 deletions
				
			
		
							
								
								
									
										106
									
								
								main.cpp
									
										
									
									
									
								
							
							
						
						
									
										106
									
								
								main.cpp
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -71,6 +71,8 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
 | 
			
		|||
{
 | 
			
		||||
	const char mi50Chip[] = "amdgpu-pci-2300";
 | 
			
		||||
	const char mi25Chip[] = "amdgpu-pci-4300";
 | 
			
		||||
	bool hitMi25 = false;
 | 
			
		||||
	bool hitMi50 = false;
 | 
			
		||||
	const char monitored_sensor_name[] = "edge";
 | 
			
		||||
 | 
			
		||||
	double max_temp = std::numeric_limits<double>::min();
 | 
			
		||||
| 
						 | 
				
			
			@ -78,32 +80,58 @@ double gpu_fan_zone(const std::vector<Sensor>& sensors)
 | 
			
		|||
	{
 | 
			
		||||
		if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name)
 | 
			
		||||
		{
 | 
			
		||||
			if(sensor.chip == mi50Chip)
 | 
			
		||||
				hitMi50 = true;
 | 
			
		||||
			else
 | 
			
		||||
				hitMi25 = true;
 | 
			
		||||
			if(max_temp < sensor.reading)
 | 
			
		||||
				max_temp = sensor.reading;
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return fan_curve(max_temp, 0.20, 1.0, 45, 75);
 | 
			
		||||
	if(!hitMi50 || !hitMi25)
 | 
			
		||||
	{
 | 
			
		||||
		std::cerr<<"Could not get temperature from MI25 or MI50! Ramping fans to maximum\n";
 | 
			
		||||
		return 1.0;
 | 
			
		||||
	}
 | 
			
		||||
	else
 | 
			
		||||
		return fan_curve(max_temp, 0.20, 1.0, 45, 75);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
double system_fan_zone(const std::vector<Sensor>& sensors)
 | 
			
		||||
{
 | 
			
		||||
	Sensor cpu("IPMI", "CPU Temp");
 | 
			
		||||
	Sensor system("IPMI", "System Temp");
 | 
			
		||||
	bool hitCpu = false;
 | 
			
		||||
	bool hitSystem = false;
 | 
			
		||||
	std::vector<double> out;
 | 
			
		||||
 | 
			
		||||
	for(const Sensor& sensor : sensors)
 | 
			
		||||
	{
 | 
			
		||||
		if(cpu == sensor)
 | 
			
		||||
		{
 | 
			
		||||
			hitCpu = true;
 | 
			
		||||
			cpu = sensor;
 | 
			
		||||
		}
 | 
			
		||||
		else if(sensor == system)
 | 
			
		||||
		{
 | 
			
		||||
			hitSystem = true;
 | 
			
		||||
			system = sensor;
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65);
 | 
			
		||||
	double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70);
 | 
			
		||||
	if(hitCpu && hitSystem)
 | 
			
		||||
	{
 | 
			
		||||
		double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65);
 | 
			
		||||
		double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70);
 | 
			
		||||
 | 
			
		||||
	return std::max(fanSystem, fanCpu);
 | 
			
		||||
		return std::max(fanSystem, fanCpu);
 | 
			
		||||
	}
 | 
			
		||||
	else
 | 
			
		||||
	{
 | 
			
		||||
		std::cerr<<"Could not get temperature from System or Cpu! Ramping fans to maximum\n";
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
 | 
			
		||||
| 
						 | 
				
			
			@ -114,20 +142,24 @@ std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
 | 
			
		|||
	return out;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv)
 | 
			
		||||
int main_loop()
 | 
			
		||||
{
 | 
			
		||||
	signal(SIGABRT, sig_handler);
 | 
			
		||||
	signal(SIGTERM, sig_handler);
 | 
			
		||||
	signal(SIGHUP, sig_handler);
 | 
			
		||||
	signal(SIGINT, sig_handler);
 | 
			
		||||
 | 
			
		||||
	if(argc > 1)
 | 
			
		||||
		quiet = true;
 | 
			
		||||
	ipmi_ctx_t raw_ctx = ipmi_open_context();
 | 
			
		||||
	if(!raw_ctx)
 | 
			
		||||
	{
 | 
			
		||||
		sensors_cleanup();
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	int ret = sensors_init(nullptr);
 | 
			
		||||
	if(ret < 0)
 | 
			
		||||
	{
 | 
			
		||||
		std::cerr<<"Could not init lm_sensors\n";
 | 
			
		||||
		ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
			
		||||
		ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
			
		||||
		ipmi_ctx_close(raw_ctx);
 | 
			
		||||
		ipmi_ctx_destroy(raw_ctx);
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
 | 
			
		||||
| 
						 | 
				
			
			@ -135,13 +167,28 @@ int main(int argc, char **argv)
 | 
			
		|||
	ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
 | 
			
		||||
	ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
 | 
			
		||||
 | 
			
		||||
	if(lm_chips.size() < 2)
 | 
			
		||||
	{
 | 
			
		||||
		std::cerr<<"Could not get both monitored gpus!";
 | 
			
		||||
		ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
			
		||||
		ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
			
		||||
		ipmi_ctx_close(raw_ctx);
 | 
			
		||||
		ipmi_ctx_destroy(raw_ctx);
 | 
			
		||||
		sensors_cleanup();
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
 | 
			
		||||
	if(!monitoring_ctx)
 | 
			
		||||
	{
 | 
			
		||||
		ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
			
		||||
		ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
			
		||||
		ipmi_ctx_close(raw_ctx);
 | 
			
		||||
		ipmi_ctx_destroy(raw_ctx);
 | 
			
		||||
		sensors_cleanup();
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	ipmi_ctx_t raw_ctx = ipmi_open_context();
 | 
			
		||||
	if(!raw_ctx)
 | 
			
		||||
		return 1;
 | 
			
		||||
 | 
			
		||||
	while(running)
 | 
			
		||||
	{
 | 
			
		||||
| 
						 | 
				
			
			@ -161,10 +208,39 @@ int main(int argc, char **argv)
 | 
			
		|||
		sleep(10);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	ipmi_set_fan_group(raw_ctx, 0, 1);
 | 
			
		||||
	ipmi_set_fan_group(raw_ctx, 1, 1);
 | 
			
		||||
	ipmi_ctx_close(raw_ctx);
 | 
			
		||||
	ipmi_ctx_destroy(raw_ctx);
 | 
			
		||||
	ipmi_monitoring_ctx_destroy(monitoring_ctx);
 | 
			
		||||
	sensors_cleanup();
 | 
			
		||||
 | 
			
		||||
	return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int main (int argc, char **argv)
 | 
			
		||||
{
 | 
			
		||||
	signal(SIGABRT, sig_handler);
 | 
			
		||||
	signal(SIGTERM, sig_handler);
 | 
			
		||||
	signal(SIGHUP, sig_handler);
 | 
			
		||||
	signal(SIGINT, sig_handler);
 | 
			
		||||
 | 
			
		||||
	if(argc > 1)
 | 
			
		||||
		quiet = true;
 | 
			
		||||
 | 
			
		||||
	int ret = 0;
 | 
			
		||||
	for(size_t i = 0; i < 3; ++i)
 | 
			
		||||
	{
 | 
			
		||||
		ret = main_loop();
 | 
			
		||||
		if(!running)
 | 
			
		||||
			break;
 | 
			
		||||
		std::cerr<<"Mainloop unable to start, retrying in 10 sec\n";
 | 
			
		||||
		sleep(10);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if(ret != 0)
 | 
			
		||||
		std::cerr<<"Error not clearing, giveing up\n";
 | 
			
		||||
 | 
			
		||||
	return ret;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue