229 lines
6.2 KiB
C++
229 lines
6.2 KiB
C++
#include <cstddef>
|
|
#include <cstdint>
|
|
#include <freeipmi/api/ipmi-api.h>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <unistd.h>
|
|
#include <sensors/sensors.h>
|
|
#include <sensors/error.h>
|
|
#include <signal.h>
|
|
#include <limits>
|
|
#include <array>
|
|
#include <freeipmi/freeipmi.h>
|
|
#include <fandevice.h>
|
|
|
|
#include "ipmi.h"
|
|
#include "lm.h"
|
|
#include "fan.h"
|
|
#include "ipmifan.h"
|
|
#include "fandevicefan.h"
|
|
#include "fanzone.h"
|
|
|
|
/**
 * Run flag shared between the signal handler and the control loops.
 * It is set to false by sig_handler() and polled by main_loop() and main().
 * Declared volatile because it is written from a signal handler: volatile
 * sig_atomic_t is the type a handler may portably write to.
 */
volatile sig_atomic_t running = true;

/**
 * Signal handler: requests shutdown by clearing the run flag.
 *
 * @param sig signal number, unused
 */
void sig_handler(int sig)
{
	(void)sig;
	running = false;
}
|
|
|
|
// When true, main_loop() skips printing the sensor readings and fan zone state.
// Set by main() when at least one command line argument is given.
bool quiet = false;
|
|
|
|
std::vector<Sensor> gather_sensors(std::vector<Sensor>& ipmi_sensors, ipmi_monitoring_ctx_t ctx, std::vector<const sensors_chip_name*>& lm_chips)
|
|
{
|
|
std::vector<Sensor> out;
|
|
struct ipmi_monitoring_ipmi_config ipmi_config = {};
|
|
ipmi_config.driver_type = IPMI_MONITORING_DRIVER_TYPE_OPENIPMI;
|
|
|
|
bool grabids = false;
|
|
for(Sensor& sensor : ipmi_sensors)
|
|
{
|
|
if(sensor.id <= 0)
|
|
{
|
|
grabids = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(grabids)
|
|
{
|
|
if(!ipmi_fill_sensor_ids(ipmi_sensors, ctx, &ipmi_config))
|
|
{
|
|
std::cout<<"could not get ids for all the required sensors\n";
|
|
return out;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
ipmi_update_sensors(ipmi_sensors, ctx, &ipmi_config);
|
|
}
|
|
|
|
out.insert(out.end(), ipmi_sensors.begin(), ipmi_sensors.end());
|
|
std::vector<Sensor> lm_sensors = lm_get_temperatures(lm_chips);
|
|
out.insert(out.end(), lm_sensors.begin(), lm_sensors.end());
|
|
|
|
return out;
|
|
}
|
|
|
|
/**
 * Linear fan curve clamped to [min_fan, max_fan].
 *
 * The output is min_fan at low_temperature and rises linearly to max_fan at
 * high_temperature; values outside that range are clamped.
 *
 * @param temperature      current temperature
 * @param min_fan          output at and below low_temperature
 * @param max_fan          output at and above high_temperature
 * @param low_temperature  temperature where the ramp starts
 * @param high_temperature temperature where the ramp ends
 * @param stop             when true, return 0 instead of min_fan below low_temperature
 * @return the fan output for the given temperature
 */
static double fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature, bool stop)
{
	const bool below_ramp = temperature < low_temperature;
	if(stop && below_ramp)
		return 0;

	const double gain = (max_fan-min_fan)/(high_temperature-low_temperature);
	const double ramp = min_fan+gain*(temperature-low_temperature);
	const double capped = std::min(max_fan, ramp);
	return std::max(capped, min_fan);
}
|
|
|
|
static double mi100_fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature,
|
|
double push_down_low_temperature, double push_down_high_temperature, bool &push_down_state)
|
|
{
|
|
double speed = fan_curve(temperature, min_fan, max_fan, low_temperature, high_temperature, false);
|
|
if(push_down_state)
|
|
speed = std::max(speed, 0.6);
|
|
if(temperature < push_down_low_temperature)
|
|
push_down_state = false;
|
|
else if(temperature > push_down_high_temperature)
|
|
push_down_state = true;
|
|
if(temperature > high_temperature)
|
|
return std::min((1-max_fan)*((temperature-high_temperature)/5.0)+max_fan, 1.0);
|
|
|
|
return speed;
|
|
}
|
|
|
|
/**
 * Calls ipmi_set_fan_group() with value 1 for groups 0 and 1, then closes and
 * destroys the context.
 * NOTE(review): value 1 presumably returns both groups to full speed before the
 * daemon lets go of them - confirm against ipmi.h.
 *
 * @param raw_ctx IPMI context to release; must not be used after this call
 */
void ipmi_cleanup(ipmi_ctx_t raw_ctx)
{
	for(int group = 0; group < 2; ++group)
		ipmi_set_fan_group(raw_ctx, group, 1);

	ipmi_ctx_close(raw_ctx);
	ipmi_ctx_destroy(raw_ctx);
}
|
|
|
|
int main_loop()
|
|
{
|
|
ipmi_ctx_t raw_ctx = ipmi_open_context();
|
|
if(!raw_ctx)
|
|
{
|
|
std::cerr<<"Unable to connect to impi\n";
|
|
return 1;
|
|
}
|
|
|
|
int ret = sensors_init(nullptr);
|
|
if(ret < 0)
|
|
{
|
|
std::cerr<<"Could not init lm_sensors\n";
|
|
ipmi_cleanup(raw_ctx);
|
|
return 1;
|
|
}
|
|
|
|
std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
|
|
std::vector<Sensor> ipmi_sensors;
|
|
ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
|
|
ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
|
|
|
|
std::vector<Sensor> lmSensors;
|
|
lmSensors.push_back(Sensor("amdgpu-pci-0300", "edge"));
|
|
lmSensors.push_back(Sensor("amdgpu-pci-8300", "edge"));
|
|
|
|
if(lm_chips.size() < 2)
|
|
{
|
|
std::cerr<<"Could not get enough monitored gpus!\n";
|
|
ipmi_cleanup(raw_ctx);
|
|
sensors_cleanup();
|
|
return 1;
|
|
}
|
|
|
|
ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
|
|
if(!monitoring_ctx)
|
|
{
|
|
std::cerr<<"Unable to connect to impi for monitoring\n";
|
|
ipmi_cleanup(raw_ctx);
|
|
sensors_cleanup();
|
|
return 1;
|
|
}
|
|
|
|
struct fandevice fdevice;
|
|
ret = fandevice_connect(&fdevice, 0);
|
|
if(ret < 0)
|
|
{
|
|
std::cerr<<"Unable to connect to FanDevice\n";
|
|
ipmi_cleanup(raw_ctx);
|
|
sensors_cleanup();
|
|
return 1;
|
|
}
|
|
|
|
std::vector<Fan*> fans;
|
|
fans.push_back(new IpmiFan(raw_ctx, 0, "IPMI CPU FAN"));
|
|
fans.push_back(new IpmiFan(raw_ctx, 1, "IPMI SYSTEM FAN"));
|
|
fans.push_back(new FanDeviceFan(&fdevice, FAN_A, "MI100_1 FAN"));
|
|
fans.push_back(new FanDeviceFan(&fdevice, FAN_B, "MI100_2 FAN"));
|
|
fans.push_back(new FanDeviceFan(&fdevice, FAN_D, "TOP SYSTEM FAN"));
|
|
fans.push_back(new FanDeviceFan(&fdevice, FAN_C, "FRONT SYSTEM FAN"));
|
|
|
|
std::array<bool, 2> pushDownStates = {true, true};
|
|
|
|
std::vector<FanZone*> fanZones;
|
|
fanZones.push_back(new FanZone(ipmi_sensors[0], fans[0], [](double in){return fan_curve(in, 0.1, 1, 45, 65, false);}, "CPU FAN ZONE"));
|
|
fanZones.push_back(new FanZone({ipmi_sensors[0], ipmi_sensors[1]}, fans[1], [](double in){return fan_curve(in, 0.2, 1, 40, 55, false);}, "SYSTEM FAN ZONE"));
|
|
fanZones.push_back(new FanZone({ipmi_sensors[0], ipmi_sensors[1]}, fans[4], [](double in){return fan_curve(in, 0.5, 1, 60, 65, true);}, "TOP FAN ZONE"));
|
|
fanZones.push_back(new FanZone({lmSensors[0], lmSensors[1]}, fans[5], [](double in){return fan_curve(in, 0, 1, 60, 80, true);}, "FRONT FAN ZONE"));
|
|
fanZones.push_back(new FanZone(lmSensors[1], fans[2], [&pushDownStates](double in){return mi100_fan_curve(in, 0.14, 0.7, 65, 80, 50, 70, pushDownStates[0]);}, "MI100_1 FAN ZONE"));
|
|
fanZones.push_back(new FanZone(lmSensors[0], fans[3], [&pushDownStates](double in){return mi100_fan_curve(in, 0.14, 0.7, 65, 80, 50, 70, pushDownStates[1]);}, "MI100_2 FAN ZONE"));
|
|
while(running)
|
|
{
|
|
std::vector<Sensor> sensors = gather_sensors(ipmi_sensors, monitoring_ctx, lm_chips);
|
|
|
|
if(!quiet)
|
|
{
|
|
for(const Sensor& sensor : sensors)
|
|
std::cout<<sensor.chip<<' '<<sensor.name<<": "<<sensor.reading<<'\n';
|
|
|
|
for(FanZone* zone : fanZones)
|
|
zone->print(sensors);
|
|
}
|
|
|
|
for(FanZone* zone : fanZones)
|
|
zone->step(sensors);
|
|
std::cout<<'\n';
|
|
sleep(10);
|
|
}
|
|
|
|
for(FanZone* zone : fanZones)
|
|
delete zone;
|
|
for(Fan* fan : fans)
|
|
delete fan;
|
|
|
|
ipmi_cleanup(raw_ctx);
|
|
ipmi_monitoring_ctx_destroy(monitoring_ctx);
|
|
sensors_cleanup();
|
|
|
|
return 0;
|
|
}
|
|
|
|
int main (int argc, char **argv)
|
|
{
|
|
signal(SIGABRT, sig_handler);
|
|
signal(SIGTERM, sig_handler);
|
|
signal(SIGHUP, sig_handler);
|
|
signal(SIGINT, sig_handler);
|
|
|
|
if(argc > 1)
|
|
quiet = true;
|
|
|
|
int ret = 0;
|
|
for(size_t i = 0; i < 3; ++i)
|
|
{
|
|
ret = main_loop();
|
|
if(!running)
|
|
break;
|
|
std::cerr<<"Mainloop unable to start, retrying in 10 sec\n";
|
|
sleep(10);
|
|
}
|
|
|
|
if(ret != 0)
|
|
std::cerr<<"Error not clearing, giveing up\n";
|
|
|
|
return ret;
|
|
}
|
|
|