Compare commits

...

11 Commits

Author SHA1 Message Date
8c93f843b5 update fur current gpu config 2024-11-14 12:36:38 +01:00
ef6b0c7d4b fix bug where unmonitored gpus contribute to fan speed 2024-01-11 19:29:02 +01:00
2aa8d88f32 add mi25 2023-11-01 12:06:01 +01:00
79a11e6214 set correct pci id for second MI50 2023-09-14 22:22:20 +02:00
9a1d3d301f update for 2x mi50 2023-09-14 16:59:37 +02:00
6d767271f3 lower the minimum speed for the gpu fan zone 2023-05-11 11:01:16 +02:00
77592844fb Increase system safety and ensure fans fail in a safe mode 2023-05-04 14:02:17 +02:00
c084c42794 add the option to disable output 2023-05-03 21:08:37 +02:00
b1c92cd561 add service 2023-05-03 10:52:48 +02:00
a6e6b758e4 ajust parameters 2023-05-02 21:46:59 +02:00
913cea1603 add ipmi fan controll support 2023-05-02 18:58:52 +02:00
6 changed files with 213 additions and 66 deletions

View File

@ -4,14 +4,28 @@ project(ipmifan LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 20)
find_package(Doxygen)
# If no install prefix was given explicitly, install under /usr
# (the bundled ipmifan.service starts the binary from /usr/bin).
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX "/usr" CACHE PATH "..." FORCE)
endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
find_package(PkgConfig REQUIRED)
pkg_search_module(IPMI REQUIRED libipmimonitoring)
pkg_search_module(IPMI_MONITORING REQUIRED libipmimonitoring)
pkg_search_module(IPMI REQUIRED libfreeipmi)
pkg_search_module(FANDEVICE REQUIRED fandevice)
pkg_search_module(SYSTEMD systemd)
add_executable(${PROJECT_NAME} main.cpp ipmi.cpp lm.cpp)
target_link_libraries(${PROJECT_NAME} ${IPMI_LINK_LIBRARIES} ${IPMIPOSIX_LINK_LIBRARIES} sensors)
target_include_directories(${PROJECT_NAME} PRIVATE ${IPMI_INCLUDE_DIRS} ${IPMIPOSIX_INCLUDE_DIRS})
add_executable(${PROJECT_NAME} main.cpp ipmi.cpp lm.cpp ipmifan.cpp fandevicefan.cpp fanzone.cpp)
target_link_libraries(${PROJECT_NAME} ${IPMI_LINK_LIBRARIES} ${FANDEVICE_LINK_LIBRARIES} ${IPMI_MONITORING_LINK_LIBRARIES} sensors)
target_include_directories(${PROJECT_NAME} PRIVATE ${IPMI_INCLUDE_DIRS} ${FANDEVICE_INCLUDE_DIRS} ${IPMI_MONITORING_INCLUDE_DIRS})
target_compile_options(${PROJECT_NAME} PRIVATE "-Wall" "-O2" "-g" "-fno-strict-aliasing" "-Wfatal-errors" "-Wno-reorder")
install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin)
# Install the service file only when pkg-config located systemd.
if(SYSTEMD_FOUND)
# Ask systemd's pkg-config file for its system unit search path.
pkg_get_variable(SYSTEMD_UNIT_DIR_PKG systemd systemd_system_unit_path)
message(STATUS ${SYSTEMD_UNIT_DIR_PKG})
# The value is a ':'-separated path list; turn it into a CMake list
# and install into its first entry.
string(REPLACE ":" ";" SYSTEMD_UNIT_DIR_LIST ${SYSTEMD_UNIT_DIR_PKG})
list(GET SYSTEMD_UNIT_DIR_LIST 0 SYSTEMD_UNIT_DIR)
install(FILES ipmifan.service DESTINATION ${SYSTEMD_UNIT_DIR})
endif(SYSTEMD_FOUND)

View File

@ -2,6 +2,8 @@
#include <algorithm>
#include <iostream>
// Size in bytes of the response buffer handed to ipmi_cmd_raw() by ipmi_set_fan_group().
static constexpr size_t IPMI_RAW_MAX_ARGS = 65536*2;
static double ipmi_convert_sensor_reading(void *sensor_reading, int sensor_reading_type)
{
if(sensor_reading_type == IPMI_MONITORING_SENSOR_READING_TYPE_UNSIGNED_INTEGER8_BOOL)
@ -102,3 +104,37 @@ ipmi_monitoring_ctx_t init_ipmi_monitoring()
return ctx;
}
/**
 * @brief Creates a FreeIPMI context and opens it on an in-band driver.
 *
 * On any failure the partially set up context is destroyed and a message is
 * written to std::cerr.
 *
 * @return the opened context, or nullptr if the context could not be
 *         allocated, the driver lookup failed, or no in-band driver was found.
 */
ipmi_ctx_t ipmi_open_context()
{
	ipmi_ctx_t ctx = ipmi_ctx_create();
	if(!ctx)
	{
		std::cerr<<"Could not allocae raw context\n";
		return nullptr;
	}

	ipmi_driver_type_t driver = IPMI_DEVICE_OPENIPMI;
	int ret = ipmi_ctx_find_inband(ctx, &driver, false, 0, 0, nullptr, 0, 0);
	if(ret < 0)
	{
		std::cerr<<"Could not create raw context "<<ipmi_ctx_errormsg(ctx)<<'\n';
		ipmi_ctx_destroy(ctx);
		return nullptr;
	}
	// ipmi_ctx_find_inband() reports "no driver found" as 0 rather than as an
	// error; the context is not open in that case and must not be handed out.
	if(ret == 0)
	{
		std::cerr<<"Could not create raw context: no in-band ipmi driver found\n";
		ipmi_ctx_destroy(ctx);
		return nullptr;
	}

	return ctx;
}
/**
 * @brief Sends a raw IPMI request that sets the speed of one fan group.
 *
 * @param raw_ctx context the request is sent through via ipmi_cmd_raw()
 * @param group   fan group number, placed unmodified into the request
 * @param speed   requested speed as a fraction; it is converted to a whole
 *                percentage and limited to 0..100. NaN is treated as 100 so
 *                that a bad input never slows the fans down.
 * @return true if ipmi_cmd_raw() did not report an error, false otherwise
 *         (the error text is written to std::cerr).
 */
bool ipmi_set_fan_group(ipmi_ctx_t raw_ctx, uint8_t group, double speed)
{
	// Limit the value while it is still a double: converting an out-of-range
	// or NaN double to an integer type is undefined behaviour, so the cast
	// must only ever see 0..100. NaN fails the comparison and keeps 100.
	double percent = 100.0;
	if(speed < 1.0)
		percent = std::max(speed*100.0, 0.0);
	const uint8_t converted_speed = static_cast<uint8_t>(percent);

	// NOTE(review): net function 0x30 with 0x70 0x66 0x01 is presumably a
	// vendor specific (OEM) fan request - confirm against the board documentation.
	const uint8_t command[] = {0x70, 0x66, 0x01, group, converted_speed};
	char bytesrx[IPMI_RAW_MAX_ARGS] = {0};

	int rxlen = ipmi_cmd_raw(raw_ctx, 0, 0x30, command, sizeof(command), bytesrx, IPMI_RAW_MAX_ARGS);
	if(rxlen < 0)
	{
		std::cerr<<"Raw write to ipmi failed with: "<<ipmi_ctx_errormsg(raw_ctx)<<'\n';
		return false;
	}
	return true;
}

5
ipmi.h
View File

@ -1,6 +1,7 @@
#include <vector>
#include <string>
#include <ipmi_monitoring.h>
#include <freeipmi/freeipmi.h>
#include "sensor.h"
@ -11,3 +12,7 @@ bool ipmi_fill_sensor_ids(std::vector<Sensor>& sensors, ipmi_monitoring_ctx_t ct
bool ipmi_update_sensors(std::vector<Sensor>& sensors, ipmi_monitoring_ctx_t ctx, struct ipmi_monitoring_ipmi_config* config);
ipmi_monitoring_ctx_t init_ipmi_monitoring();
/// Creates a context for raw IPMI commands and opens it on an in-band driver.
/// Returns nullptr on failure.
ipmi_ctx_t ipmi_open_context();
/// Sets the speed of fan group `group` through a raw IPMI request.
/// `speed` is a fraction that is sent as a percentage (speed*100).
/// Returns false if the raw command failed.
bool ipmi_set_fan_group(ipmi_ctx_t raw_ctx, uint8_t group, double speed);

10
ipmifan.service Normal file
View File

@ -0,0 +1,10 @@
[Unit]
Description=Start IPMI fan control
After=lm_sensors.service systemd-modules-load.service
[Service]
Type=simple
ExecStart=/usr/bin/ipmifan -q
[Install]
WantedBy=multi-user.target

202
main.cpp
View File

@ -1,3 +1,6 @@
#include <cstddef>
#include <cstdint>
#include <freeipmi/api/ipmi-api.h>
#include <vector>
#include <string>
#include <iostream>
@ -7,9 +10,16 @@
#include <sensors/error.h>
#include <signal.h>
#include <limits>
#include <array>
#include <freeipmi/freeipmi.h>
#include <fandevice.h>
#include "ipmi.h"
#include "lm.h"
#include "fan.h"
#include "ipmifan.h"
#include "fandevicefan.h"
#include "fanzone.h"
sig_atomic_t running = true;
@ -19,6 +29,8 @@ void sig_handler(int sig)
running = false;
}
bool quiet;
std::vector<Sensor> gather_sensors(std::vector<Sensor>& ipmi_sensors, ipmi_monitoring_ctx_t ctx, std::vector<const sensors_chip_name*>& lm_chips)
{
std::vector<Sensor> out;
@ -55,57 +67,137 @@ std::vector<Sensor> gather_sensors(std::vector<Sensor>& ipmi_sensors, ipmi_monit
return out;
}
double fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature)
/**
 * @brief Linear fan curve between two temperatures, limited to [min_fan, max_fan].
 *
 * @param temperature      current temperature
 * @param min_fan          speed returned at or below low_temperature
 * @param max_fan          speed returned at or above high_temperature
 * @param low_temperature  temperature at which the ramp starts
 * @param high_temperature temperature at which the ramp reaches max_fan
 * @param stop             when true, the fan is switched off (0) below low_temperature
 * @return the fan speed for the given temperature
 */
static double fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature, bool stop)
{
	const bool below_ramp = temperature < low_temperature;
	if(below_ramp && stop)
		return 0;

	// Straight line through (low_temperature, min_fan) and (high_temperature, max_fan).
	const double gain = (max_fan-min_fan)/(high_temperature-low_temperature);
	const double unclamped = min_fan+gain*(temperature-low_temperature);

	const double capped = std::min(max_fan, unclamped);
	return std::max(capped, min_fan);
}
double gpu_fan_zone(const std::vector<Sensor>& sensors)
/**
 * @brief Fan curve with a hysteresis controlled minimum speed and an
 *        over-temperature boost.
 *
 * The base speed comes from fan_curve() (never stopping the fan). While
 * push_down_state is set, the speed is raised to at least 0.6. Above
 * high_temperature the result instead ramps linearly from max_fan to 1.0
 * over the next 5 degrees.
 *
 * @param temperature                current temperature
 * @param min_fan                    lower speed limit of the base curve
 * @param max_fan                    upper speed limit of the base curve
 * @param low_temperature            start of the base curve ramp
 * @param high_temperature           end of the base curve ramp
 * @param push_down_low_temperature  below this the push down state is cleared
 * @param push_down_high_temperature above this the push down state is set
 * @param push_down_state            hysteresis state kept by the caller between
 *                                   calls; read first, then updated
 * @return the fan speed for the given temperature
 */
static double mi100_fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature,
	double push_down_low_temperature, double push_down_high_temperature, bool &push_down_state)
{
	double speed = fan_curve(temperature, min_fan, max_fan, low_temperature, high_temperature, false);

	// The minimum is applied using the state left by the previous call;
	// the state change below only takes effect on the next call.
	if(push_down_state)
		speed = std::max(speed, 0.6);

	if(temperature < push_down_low_temperature)
		push_down_state = false;
	else if(temperature > push_down_high_temperature)
		push_down_state = true;

	if(temperature > high_temperature)
		return std::min((1-max_fan)*((temperature-high_temperature)/5.0)+max_fan, 1.0);

	return speed;
}
/**
 * @brief Sets IPMI fan groups 0 and 1 to full speed, then closes and
 *        destroys the context.
 *
 * @param raw_ctx context to shut down; it must not be used afterwards
 */
void ipmi_cleanup(ipmi_ctx_t raw_ctx)
{
	// Leave the fans at 100% so nothing is left running slow once control ends.
	for(uint8_t group = 0; group < 2; ++group)
		ipmi_set_fan_group(raw_ctx, group, 1.0);

	ipmi_ctx_close(raw_ctx);
	ipmi_ctx_destroy(raw_ctx);
}
int main_loop()
{
ipmi_ctx_t raw_ctx = ipmi_open_context();
if(!raw_ctx)
{
std::cerr<<"Unable to connect to impi\n";
return 1;
}
int ret = sensors_init(nullptr);
if(ret < 0)
{
std::cerr<<"Could not init lm_sensors\n";
ipmi_cleanup(raw_ctx);
return 1;
}
std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
std::vector<Sensor> ipmi_sensors;
ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
std::vector<Sensor> lmSensors;
lmSensors.push_back(Sensor("amdgpu-pci-0300", "edge"));
lmSensors.push_back(Sensor("amdgpu-pci-8300", "edge"));
if(lm_chips.size() < 2)
{
std::cerr<<"Could not get enough monitored gpus!\n";
ipmi_cleanup(raw_ctx);
sensors_cleanup();
return 1;
}
ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring();
if(!monitoring_ctx)
{
std::cerr<<"Unable to connect to impi for monitoring\n";
ipmi_cleanup(raw_ctx);
sensors_cleanup();
return 1;
}
struct fandevice fdevice;
ret = fandevice_connect(&fdevice, 0);
if(ret < 0)
{
std::cerr<<"Unable to connect to FanDevice\n";
ipmi_cleanup(raw_ctx);
sensors_cleanup();
return 1;
}
std::vector<Fan*> fans;
fans.push_back(new IpmiFan(raw_ctx, 0, "IPMI CPU FAN"));
fans.push_back(new IpmiFan(raw_ctx, 1, "IPMI SYSTEM FAN"));
fans.push_back(new FanDeviceFan(&fdevice, FAN_A, "MI100_1 FAN"));
fans.push_back(new FanDeviceFan(&fdevice, FAN_B, "MI100_2 FAN"));
fans.push_back(new FanDeviceFan(&fdevice, FAN_D, "TOP SYSTEM FAN"));
fans.push_back(new FanDeviceFan(&fdevice, FAN_C, "FRONT SYSTEM FAN"));
std::array<bool, 2> pushDownStates = {true, true};
std::vector<FanZone*> fanZones;
fanZones.push_back(new FanZone(ipmi_sensors[0], fans[0], [](double in){return fan_curve(in, 0.1, 1, 45, 65, false);}, "CPU FAN ZONE"));
fanZones.push_back(new FanZone({ipmi_sensors[0], ipmi_sensors[1]}, fans[1], [](double in){return fan_curve(in, 0.2, 1, 40, 55, false);}, "SYSTEM FAN ZONE"));
fanZones.push_back(new FanZone({ipmi_sensors[0], ipmi_sensors[1]}, fans[4], [](double in){return fan_curve(in, 0.5, 1, 60, 65, true);}, "TOP FAN ZONE"));
fanZones.push_back(new FanZone({lmSensors[0], lmSensors[1]}, fans[5], [](double in){return fan_curve(in, 0, 1, 60, 80, true);}, "FRONT FAN ZONE"));
fanZones.push_back(new FanZone(lmSensors[1], fans[2], [&pushDownStates](double in){return mi100_fan_curve(in, 0.14, 0.7, 65, 80, 50, 70, pushDownStates[0]);}, "MI100_1 FAN ZONE"));
fanZones.push_back(new FanZone(lmSensors[0], fans[3], [&pushDownStates](double in){return mi100_fan_curve(in, 0.14, 0.7, 65, 80, 50, 70, pushDownStates[1]);}, "MI100_2 FAN ZONE"));
while(running)
{
std::vector<Sensor> sensors = gather_sensors(ipmi_sensors, monitoring_ctx, lm_chips);
if(!quiet)
{
for(const Sensor& sensor : sensors)
{
if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name)
{
if(max_temp < sensor.reading)
max_temp = sensor.reading;
}
std::cout<<sensor.chip<<' '<<sensor.name<<": "<<sensor.reading<<'\n';
for(FanZone* zone : fanZones)
zone->print(sensors);
}
return fan_curve(max_temp, 0.2, 1.0, 40, 75);
for(FanZone* zone : fanZones)
zone->step(sensors);
std::cout<<'\n';
sleep(10);
}
double system_fan_zone(const std::vector<Sensor>& sensors)
{
Sensor cpu("IPMI", "CPU Temp");
Sensor system("IPMI", "System Temp");
std::vector<double> out;
for(FanZone* zone : fanZones)
delete zone;
for(Fan* fan : fans)
delete fan;
for(const Sensor& sensor : sensors)
{
if(cpu == sensor)
cpu = sensor;
else if(sensor == system)
system = sensor;
}
ipmi_cleanup(raw_ctx);
ipmi_monitoring_ctx_destroy(monitoring_ctx);
sensors_cleanup();
double fanSystem = fan_curve(system.reading, 0.2, 1.0, 35, 45);
double fanCpu = fan_curve(cpu.reading, 0.2, 1.0, 40, 70);
return std::max(fanSystem, fanCpu);
}
std::vector<double> get_fan_zones(const std::vector<Sensor>& sensors)
{
std::vector<double> out;
out.push_back(system_fan_zone(sensors));
out.push_back(gpu_fan_zone(sensors));
return out;
return 0;
}
int main (int argc, char **argv)
@ -115,32 +207,22 @@ int main (int argc, char **argv)
signal(SIGHUP, sig_handler);
signal(SIGINT, sig_handler);
int ret = sensors_init(nullptr);
if(ret < 0)
if(argc > 1)
quiet = true;
int ret = 0;
for(size_t i = 0; i < 3; ++i)
{
std::cerr<<"Could not init lm_sensors\n";
ret = main_loop();
if(!running)
break;
std::cerr<<"Mainloop unable to start, retrying in 10 sec\n";
sleep(10);
}
std::vector<const sensors_chip_name*> lm_chips = lm_get_chips("amdgpu-*");
std::vector<Sensor> ipmi_sensors;
ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp"));
ipmi_sensors.push_back(Sensor("IPMI", "System Temp"));
if(ret != 0)
std::cerr<<"Error not clearing, giveing up\n";
ipmi_monitoring_ctx_t ctx = init_ipmi_monitoring();
if(!ctx)
return 1;
while(running)
{
std::vector<Sensor> sensors = gather_sensors(ipmi_sensors, ctx, lm_chips);
std::vector<double> fanzones = get_fan_zones(sensors);
for(const double fanzone : fanzones)
std::cout<<fanzone<<'\n';
sleep(1);
}
ipmi_monitoring_ctx_destroy(ctx);
sensors_cleanup();
return 0;
return ret;
}

View File

@ -13,5 +13,5 @@ public:
public:
Sensor() = default;
// Builds a sensor from its chip and sensor name; idI is optional and defaults to 0.
Sensor(std::string chipI, std::string nameI, int idI = 0): name(nameI), chip(chipI), id(idI) {}
bool operator==(const Sensor& other) {return other.name == name && other.chip == chip;}
// Sensors are considered equal when both name and chip match; id is not compared.
bool operator==(const Sensor& other) const
{
	return name == other.name && chip == other.chip;
}
};