From 913cea1603b94531e6f1de0d480dfe614c9aa3ae Mon Sep 17 00:00:00 2001 From: Carl Philipp Klemm Date: Tue, 2 May 2023 18:58:52 +0200 Subject: [PATCH 01/11] add ipmi fan controll support --- CMakeLists.txt | 7 ++++--- main.cpp | 51 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4eb83d6..5f17f3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,11 +7,12 @@ set(CMAKE_CXX_STANDARD 20) find_package(Doxygen) find_package(PkgConfig REQUIRED) -pkg_search_module(IPMI REQUIRED libipmimonitoring) +pkg_search_module(IPMI_MONITORING REQUIRED libipmimonitoring) +pkg_search_module(IPMI REQUIRED libfreeipmi) add_executable(${PROJECT_NAME} main.cpp ipmi.cpp lm.cpp) -target_link_libraries(${PROJECT_NAME} ${IPMI_LINK_LIBRARIES} ${IPMIPOSIX_LINK_LIBRARIES} sensors) -target_include_directories(${PROJECT_NAME} PRIVATE ${IPMI_INCLUDE_DIRS} ${IPMIPOSIX_INCLUDE_DIRS}) +target_link_libraries(${PROJECT_NAME} ${IPMI_LINK_LIBRARIES} ${IPMI_MONITORING_LINK_LIBRARIES} sensors) +target_include_directories(${PROJECT_NAME} PRIVATE ${IPMI_INCLUDE_DIRS} ${IPMI_MONITORING_INCLUDE_DIRS}) target_compile_options(${PROJECT_NAME} PRIVATE "-Wall" "-O2" "-g" "-fno-strict-aliasing" "-Wfatal-errors" "-Wno-reorder") install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin) diff --git a/main.cpp b/main.cpp index d8e1fb3..9798373 100644 --- a/main.cpp +++ b/main.cpp @@ -1,3 +1,6 @@ +#include +#include +#include #include #include #include @@ -7,10 +10,13 @@ #include #include #include +#include #include "ipmi.h" #include "lm.h" +static constexpr size_t IPMI_RAW_MAX_ARGS = 65536*2; + sig_atomic_t running = true; void sig_handler(int sig) @@ -108,6 +114,35 @@ std::vector get_fan_zones(const std::vector& sensors) return out; } +ipmi_ctx_t ipmi_open() +{ + ipmi_ctx_t ctx = nullptr; + + ipmi_driver_type_t driver = IPMI_DEVICE_OPENIPMI; + int ret = ipmi_ctx_find_inband(ctx, &driver, false, 0, 0, nullptr, 0, 0); + if(ret < 0) + { + std::cerr<<"Could not create raw context "<(64), static_cast(speed*64)), static_cast(0)); + + char command[] = {0x70, 0x66, 0x01, static_cast(group), converted_speed}; + char bytesrx[IPMI_RAW_MAX_ARGS] = {0}; + int rxlen = ipmi_cmd_raw(raw_ctx, 0, 0x30, command, sizeof(command), bytesrx, IPMI_RAW_MAX_ARGS); + if(rxlen < 0) + { + std::cerr<<"Raw write to ipmi failed with: "< sensors = gather_sensors(ipmi_sensors, ctx, lm_chips); + std::vector sensors = gather_sensors(ipmi_sensors, monitoring_ctx, lm_chips); std::vector fanzones = get_fan_zones(sensors); for(const double fanzone : fanzones) std::cout< Date: Tue, 2 May 2023 21:46:59 +0200 Subject: [PATCH 02/11] ajust parameters --- main.cpp | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/main.cpp b/main.cpp index 9798373..f23d03a 100644 --- a/main.cpp +++ b/main.cpp @@ -83,7 +83,7 @@ double gpu_fan_zone(const std::vector& sensors) } } - return fan_curve(max_temp, 0.2, 1.0, 40, 75); + return fan_curve(max_temp, 0.20, 1.0, 45, 75); } double system_fan_zone(const std::vector& sensors) @@ -100,8 +100,8 @@ double system_fan_zone(const std::vector& sensors) system = sensor; } - double fanSystem = fan_curve(system.reading, 0.2, 1.0, 35, 45); - double fanCpu = fan_curve(cpu.reading, 0.2, 1.0, 40, 70); + double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65); + double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70); return std::max(fanSystem, fanCpu); } @@ -116,13 +116,19 @@ std::vector get_fan_zones(const std::vector& sensors) ipmi_ctx_t ipmi_open() { - ipmi_ctx_t ctx = nullptr; + ipmi_ctx_t ctx = ipmi_ctx_create(); + if(!ctx) + { + std::cerr<<"Could not allocae raw context\n"; + return nullptr; + } ipmi_driver_type_t driver = IPMI_DEVICE_OPENIPMI; int ret = ipmi_ctx_find_inband(ctx, &driver, false, 0, 0, nullptr, 0, 0); if(ret < 0) { std::cerr<<"Could not create raw context "<(64), static_cast(speed*64)), static_cast(0)); + char converted_speed = std::max(std::min(static_cast(100), static_cast(speed*100)), static_cast(0)); + + std::cout<<"setting fan group "<(group)<<" to "<(converted_speed)<<")\n"; char command[] = {0x70, 0x66, 0x01, static_cast(group), converted_speed}; char bytesrx[IPMI_RAW_MAX_ARGS] = {0}; @@ -143,7 +151,7 @@ bool ipmi_set_fan_group(ipmi_ctx_t raw_ctx, uint8_t group, double speed) return true; } -int main (int argc, char **argv) +int main(int argc, char **argv) { signal(SIGABRT, sig_handler); signal(SIGTERM, sig_handler); @@ -169,15 +177,15 @@ int main (int argc, char **argv) if(!raw_ctx) return 1; - - while(running) { std::vector sensors = gather_sensors(ipmi_sensors, monitoring_ctx, lm_chips); + for(const Sensor& sensor : sensors) + std::cout<<"Sensor "< fanzones = get_fan_zones(sensors); - for(const double fanzone : fanzones) - std::cout< Date: Wed, 3 May 2023 10:52:48 +0200 Subject: [PATCH 03/11] add service --- CMakeLists.txt | 14 +++++++++++++- ipmi.cpp | 39 +++++++++++++++++++++++++++++++++++++++ ipmi.h | 5 +++++ ipmifan.service | 10 ++++++++++ main.cpp | 41 +---------------------------------------- 5 files changed, 68 insertions(+), 41 deletions(-) create mode 100644 ipmifan.service diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f17f3a..4ba30c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,11 +4,15 @@ project(ipmifan LANGUAGES CXX) set(CMAKE_CXX_STANDARD 20) -find_package(Doxygen) +if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "/usr" CACHE PATH "..." FORCE) +endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) find_package(PkgConfig REQUIRED) pkg_search_module(IPMI_MONITORING REQUIRED libipmimonitoring) pkg_search_module(IPMI REQUIRED libfreeipmi) +pkg_search_module(SYSTEMD systemd) + add_executable(${PROJECT_NAME} main.cpp ipmi.cpp lm.cpp) target_link_libraries(${PROJECT_NAME} ${IPMI_LINK_LIBRARIES} ${IPMI_MONITORING_LINK_LIBRARIES} sensors) @@ -16,3 +20,11 @@ target_include_directories(${PROJECT_NAME} PRIVATE ${IPMI_INCLUDE_DIRS} ${IPMI_M target_compile_options(${PROJECT_NAME} PRIVATE "-Wall" "-O2" "-g" "-fno-strict-aliasing" "-Wfatal-errors" "-Wno-reorder") install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin) +if(SYSTEMD_FOUND) + pkg_get_variable(SYSTEMD_UNIT_DIR_PKG systemd systemd_system_unit_path) + message(STATUS ${SYSTEMD_UNIT_DIR_PKG}) + string(REPLACE ":" ";" SYSTEMD_UNIT_DIR_LIST ${SYSTEMD_UNIT_DIR_PKG}) + list(GET SYSTEMD_UNIT_DIR_LIST 0 SYSTEMD_UNIT_DIR) + + install(FILES ipmifan.service DESTINATION ${SYSTEMD_UNIT_DIR}) +endif(SYSTEMD_FOUND) diff --git a/ipmi.cpp b/ipmi.cpp index ed4a024..4795c77 100644 --- a/ipmi.cpp +++ b/ipmi.cpp @@ -2,6 +2,8 @@ #include #include +static constexpr size_t IPMI_RAW_MAX_ARGS = 65536*2; + static double ipmi_convert_sensor_reading(void *sensor_reading, int sensor_reading_type) { if(sensor_reading_type == IPMI_MONITORING_SENSOR_READING_TYPE_UNSIGNED_INTEGER8_BOOL) @@ -102,3 +104,40 @@ ipmi_monitoring_ctx_t init_ipmi_monitoring() return ctx; } + +ipmi_ctx_t ipmi_open_context() +{ + ipmi_ctx_t ctx = ipmi_ctx_create(); + if(!ctx) + { + std::cerr<<"Could not allocae raw context\n"; + return nullptr; + } + + ipmi_driver_type_t driver = IPMI_DEVICE_OPENIPMI; + int ret = ipmi_ctx_find_inband(ctx, &driver, false, 0, 0, nullptr, 0, 0); + if(ret < 0) + { + std::cerr<<"Could not create raw context "<(100), static_cast(speed*100)), static_cast(0)); + + std::cout<<"setting fan group "<(group)<<" to "<(converted_speed)<<")\n"; + + char command[] = {0x70, 0x66, 0x01, static_cast(group), converted_speed}; + char bytesrx[IPMI_RAW_MAX_ARGS] = {0}; + int rxlen = ipmi_cmd_raw(raw_ctx, 0, 0x30, command, sizeof(command), bytesrx, IPMI_RAW_MAX_ARGS); + if(rxlen < 0) + { + std::cerr<<"Raw write to ipmi failed with: "< #include #include +#include #include "sensor.h" @@ -11,3 +12,7 @@ bool ipmi_fill_sensor_ids(std::vector& sensors, ipmi_monitoring_ctx_t ct bool ipmi_update_sensors(std::vector& sensors, ipmi_monitoring_ctx_t ctx, struct ipmi_monitoring_ipmi_config* config); ipmi_monitoring_ctx_t init_ipmi_monitoring(); + +ipmi_ctx_t ipmi_open_context(); + +bool ipmi_set_fan_group(ipmi_ctx_t raw_ctx, uint8_t group, double speed); diff --git a/ipmifan.service b/ipmifan.service new file mode 100644 index 0000000..dd595db --- /dev/null +++ b/ipmifan.service @@ -0,0 +1,10 @@ +[Unit] +Description=Start impi fan control +After=lm_sensors.service systemd-modules-load.service + +[Service] +Type=simple +ExecStart=/usr/bin/ipmifan + +[Install] +WantedBy=multi-user.target diff --git a/main.cpp b/main.cpp index f23d03a..7e71117 100644 --- a/main.cpp +++ b/main.cpp @@ -15,8 +15,6 @@ #include "ipmi.h" #include "lm.h" -static constexpr size_t IPMI_RAW_MAX_ARGS = 65536*2; - sig_atomic_t running = true; void sig_handler(int sig) @@ -114,43 +112,6 @@ std::vector get_fan_zones(const std::vector& sensors) return out; } -ipmi_ctx_t ipmi_open() -{ - ipmi_ctx_t ctx = ipmi_ctx_create(); - if(!ctx) - { - std::cerr<<"Could not allocae raw context\n"; - return nullptr; - } - - ipmi_driver_type_t driver = IPMI_DEVICE_OPENIPMI; - int ret = ipmi_ctx_find_inband(ctx, &driver, false, 0, 0, nullptr, 0, 0); - if(ret < 0) - { - std::cerr<<"Could not create raw context "<(100), static_cast(speed*100)), static_cast(0)); - - std::cout<<"setting fan group "<(group)<<" to "<(converted_speed)<<")\n"; - - char command[] = {0x70, 0x66, 0x01, static_cast(group), converted_speed}; - char bytesrx[IPMI_RAW_MAX_ARGS] = {0}; - int rxlen = ipmi_cmd_raw(raw_ctx, 0, 0x30, command, sizeof(command), bytesrx, IPMI_RAW_MAX_ARGS); - if(rxlen < 0) - { - std::cerr<<"Raw write to ipmi failed with: "< Date: Wed, 3 May 2023 21:08:37 +0200 Subject: [PATCH 04/11] add the option to disable output --- ipmi.cpp | 3 --- ipmifan.service | 2 +- main.cpp | 16 ++++++++++++++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ipmi.cpp b/ipmi.cpp index 4795c77..e5aea0c 100644 --- a/ipmi.cpp +++ b/ipmi.cpp @@ -128,9 +128,6 @@ ipmi_ctx_t ipmi_open_context() bool ipmi_set_fan_group(ipmi_ctx_t raw_ctx, uint8_t group, double speed) { char converted_speed = std::max(std::min(static_cast(100), static_cast(speed*100)), static_cast(0)); - - std::cout<<"setting fan group "<(group)<<" to "<(converted_speed)<<")\n"; - char command[] = {0x70, 0x66, 0x01, static_cast(group), converted_speed}; char bytesrx[IPMI_RAW_MAX_ARGS] = {0}; int rxlen = ipmi_cmd_raw(raw_ctx, 0, 0x30, command, sizeof(command), bytesrx, IPMI_RAW_MAX_ARGS); diff --git a/ipmifan.service b/ipmifan.service index dd595db..30ab7df 100644 --- a/ipmifan.service +++ b/ipmifan.service @@ -4,7 +4,7 @@ After=lm_sensors.service systemd-modules-load.service [Service] Type=simple -ExecStart=/usr/bin/ipmifan +ExecStart=/usr/bin/ipmifan -q [Install] WantedBy=multi-user.target diff --git a/main.cpp b/main.cpp index 7e71117..e752719 100644 --- a/main.cpp +++ b/main.cpp @@ -23,6 +23,8 @@ void sig_handler(int sig) running = false; } +bool quiet; + std::vector gather_sensors(std::vector& ipmi_sensors, ipmi_monitoring_ctx_t ctx, std::vector& lm_chips) { std::vector out; @@ -119,6 +121,9 @@ int main(int argc, char **argv) signal(SIGHUP, sig_handler); signal(SIGINT, sig_handler); + if(argc > 1) + quiet = true; + int ret = sensors_init(nullptr); if(ret < 0) { @@ -141,9 +146,16 @@ int main(int argc, char **argv) while(running) { std::vector sensors = gather_sensors(ipmi_sensors, monitoring_ctx, lm_chips); - for(const Sensor& sensor : sensors) - std::cout<<"Sensor "< fanzones = get_fan_zones(sensors); + + if(!quiet) + { + for(const Sensor& sensor : sensors) + std::cout<<"Sensor "< Date: Thu, 4 May 2023 14:02:17 +0200 Subject: [PATCH 05/11] Increase system safety and ensure fans fail in a safe mode --- main.cpp | 106 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 91 insertions(+), 15 deletions(-) diff --git a/main.cpp b/main.cpp index e752719..ec205a1 100644 --- a/main.cpp +++ b/main.cpp @@ -71,6 +71,8 @@ double gpu_fan_zone(const std::vector& sensors) { const char mi50Chip[] = "amdgpu-pci-2300"; const char mi25Chip[] = "amdgpu-pci-4300"; + bool hitMi25 = false; + bool hitMi50 = false; const char monitored_sensor_name[] = "edge"; double max_temp = std::numeric_limits::min(); @@ -78,32 +80,58 @@ double gpu_fan_zone(const std::vector& sensors) { if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name) { + if(sensor.chip == mi50Chip) + hitMi50 = true; + else + hitMi25 = true; if(max_temp < sensor.reading) max_temp = sensor.reading; } } - return fan_curve(max_temp, 0.20, 1.0, 45, 75); + if(!hitMi50 || !hitMi25) + { + std::cerr<<"Could not get temperature from MI25 or MI50! Ramping fans to maximum\n"; + return 1.0; + } + else + return fan_curve(max_temp, 0.20, 1.0, 45, 75); } double system_fan_zone(const std::vector& sensors) { Sensor cpu("IPMI", "CPU Temp"); Sensor system("IPMI", "System Temp"); + bool hitCpu = false; + bool hitSystem = false; std::vector out; for(const Sensor& sensor : sensors) { if(cpu == sensor) + { + hitCpu = true; cpu = sensor; + } else if(sensor == system) + { + hitSystem = true; system = sensor; + } } - double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65); - double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70); + if(hitCpu && hitSystem) + { + double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65); + double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70); - return std::max(fanSystem, fanCpu); + return std::max(fanSystem, fanCpu); + } + else + { + std::cerr<<"Could not get temperature from System or Cpu! Ramping fans to maximum\n"; + return 1; + } } std::vector get_fan_zones(const std::vector& sensors) @@ -114,20 +142,24 @@ std::vector get_fan_zones(const std::vector& sensors) return out; } -int main(int argc, char **argv) +int main_loop() { - signal(SIGABRT, sig_handler); - signal(SIGTERM, sig_handler); - signal(SIGHUP, sig_handler); - signal(SIGINT, sig_handler); - - if(argc > 1) - quiet = true; + ipmi_ctx_t raw_ctx = ipmi_open_context(); + if(!raw_ctx) + { + sensors_cleanup(); + return 1; + } int ret = sensors_init(nullptr); if(ret < 0) { std::cerr<<"Could not init lm_sensors\n"; + ipmi_set_fan_group(raw_ctx, 0, 1); + ipmi_set_fan_group(raw_ctx, 1, 1); + ipmi_ctx_close(raw_ctx); + ipmi_ctx_destroy(raw_ctx); + return 1; } std::vector lm_chips = lm_get_chips("amdgpu-*"); @@ -135,13 +167,28 @@ int main(int argc, char **argv) ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp")); ipmi_sensors.push_back(Sensor("IPMI", "System Temp")); + if(lm_chips.size() < 2) + { + std::cerr<<"Could not get both monitored gpus!"; + ipmi_set_fan_group(raw_ctx, 0, 1); + ipmi_set_fan_group(raw_ctx, 1, 1); + ipmi_ctx_close(raw_ctx); + ipmi_ctx_destroy(raw_ctx); + sensors_cleanup(); + return 1; + } + ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring(); if(!monitoring_ctx) + { + ipmi_set_fan_group(raw_ctx, 0, 1); + ipmi_set_fan_group(raw_ctx, 1, 1); + ipmi_ctx_close(raw_ctx); + ipmi_ctx_destroy(raw_ctx); + sensors_cleanup(); return 1; + } - ipmi_ctx_t raw_ctx = ipmi_open_context(); - if(!raw_ctx) - return 1; while(running) { @@ -161,10 +208,39 @@ int main(int argc, char **argv) sleep(10); } + ipmi_set_fan_group(raw_ctx, 0, 1); + ipmi_set_fan_group(raw_ctx, 1, 1); ipmi_ctx_close(raw_ctx); ipmi_ctx_destroy(raw_ctx); ipmi_monitoring_ctx_destroy(monitoring_ctx); sensors_cleanup(); + return 0; } +int main (int argc, char **argv) +{ + signal(SIGABRT, sig_handler); + signal(SIGTERM, sig_handler); + signal(SIGHUP, sig_handler); + signal(SIGINT, sig_handler); + + if(argc > 1) + quiet = true; + + int ret = 0; + for(size_t i = 0; i < 3; ++i) + { + ret = main_loop(); + if(!running) + break; + std::cerr<<"Mainloop unable to start, retrying in 10 sec\n"; + sleep(10); + } + + if(ret != 0) + std::cerr<<"Error not clearing, giveing up\n"; + + return ret; +} + From 6d767271f388dfeab5576da39358e3eb2860459e Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 11 May 2023 11:01:16 +0200 Subject: [PATCH 06/11] lower the minimum speed for the gpu fan zone --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index ec205a1..c4fbb9e 100644 --- a/main.cpp +++ b/main.cpp @@ -95,7 +95,7 @@ double gpu_fan_zone(const std::vector& sensors) return 1.0; } else - return fan_curve(max_temp, 0.20, 1.0, 45, 75); + return fan_curve(max_temp, 0.10, 1.0, 45, 75); } double system_fan_zone(const std::vector& sensors) From 9a1d3d301f0208295d25c2b9bbb3a0be6fe138da Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 14 Sep 2023 16:59:37 +0200 Subject: [PATCH 07/11] update for 2x mi50 --- main.cpp | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/main.cpp b/main.cpp index c4fbb9e..3f521eb 100644 --- a/main.cpp +++ b/main.cpp @@ -69,33 +69,35 @@ double fan_curve(double temperature, double min_fan, double max_fan, double low_ double gpu_fan_zone(const std::vector& sensors) { - const char mi50Chip[] = "amdgpu-pci-2300"; - const char mi25Chip[] = "amdgpu-pci-4300"; - bool hitMi25 = false; - bool hitMi50 = false; + std::vector> gpus = {{"amdgpu-pci-0300", false}, {"amdgpu-pci-c300", false}}; const char monitored_sensor_name[] = "edge"; double max_temp = std::numeric_limits::min(); for(const Sensor& sensor : sensors) { - if((sensor.chip == mi50Chip || sensor.chip == mi25Chip) && sensor.name == monitored_sensor_name) + if(sensor.name == monitored_sensor_name) { - if(sensor.chip == mi50Chip) - hitMi50 = true; - else - hitMi25 = true; - if(max_temp < sensor.reading) - max_temp = sensor.reading; + for(std::pair& gpu : gpus) + { + if(sensor.chip == gpu.first) + { + gpu.second = true; + } + if(max_temp < sensor.reading) + max_temp = sensor.reading; + } + } + } + for(std::pair& gpu : gpus) + { + if(!gpu.second) + { + std::cerr<<"Could not get temperature from "<& sensors) From 79a11e6214857553df17ae7c06539e5fa669ee1a Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 14 Sep 2023 22:22:20 +0200 Subject: [PATCH 08/11] set correct pci id for second MI50 --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 3f521eb..6654b8c 100644 --- a/main.cpp +++ b/main.cpp @@ -69,7 +69,7 @@ double fan_curve(double temperature, double min_fan, double max_fan, double low_ double gpu_fan_zone(const std::vector& sensors) { - std::vector> gpus = {{"amdgpu-pci-0300", false}, {"amdgpu-pci-c300", false}}; + std::vector> gpus = {{"amdgpu-pci-0300", false}, {"amdgpu-pci-8300", false}}; const char monitored_sensor_name[] = "edge"; double max_temp = std::numeric_limits::min(); From 2aa8d88f32444820a333cae039c047aeca34e76d Mon Sep 17 00:00:00 2001 From: uvos Date: Wed, 1 Nov 2023 12:06:01 +0100 Subject: [PATCH 09/11] add mi25 --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 6654b8c..ee65c0a 100644 --- a/main.cpp +++ b/main.cpp @@ -69,7 +69,7 @@ double fan_curve(double temperature, double min_fan, double max_fan, double low_ double gpu_fan_zone(const std::vector& sensors) { - std::vector> gpus = {{"amdgpu-pci-0300", false}, {"amdgpu-pci-8300", false}}; + std::vector> gpus = {{"amdgpu-pci-0300", false}, {"amdgpu-pci-8300", false}, {"amdgpu-pci-8900", false}}; const char monitored_sensor_name[] = "edge"; double max_temp = std::numeric_limits::min(); From ef6b0c7d4b90b6ecd3ac514009b9547352c700f1 Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 11 Jan 2024 19:29:02 +0100 Subject: [PATCH 10/11] fix bug where unmonitored gpus contribute to fan speed --- main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.cpp b/main.cpp index ee65c0a..9b2b01a 100644 --- a/main.cpp +++ b/main.cpp @@ -81,10 +81,10 @@ double gpu_fan_zone(const std::vector& sensors) { if(sensor.chip == gpu.first) { + if(max_temp < sensor.reading) + max_temp = sensor.reading; gpu.second = true; } - if(max_temp < sensor.reading) - max_temp = sensor.reading; } } } @@ -97,7 +97,7 @@ double gpu_fan_zone(const std::vector& sensors) } } - return fan_curve(max_temp, 0.10, 1.0, 45, 75); + return fan_curve(max_temp, 0.05, 1.0, 45, 75); } double system_fan_zone(const std::vector& sensors) From 8c93f843b5d7f3975750499a99ef02d9227cda38 Mon Sep 17 00:00:00 2001 From: uvos Date: Thu, 14 Nov 2024 12:36:38 +0100 Subject: [PATCH 11/11] update fur current gpu config --- CMakeLists.txt | 7 +- main.cpp | 170 ++++++++++++++++++++++--------------------------- sensor.h | 2 +- 3 files changed, 80 insertions(+), 99 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ba30c9..dffe7e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,12 +11,13 @@ endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) find_package(PkgConfig REQUIRED) pkg_search_module(IPMI_MONITORING REQUIRED libipmimonitoring) pkg_search_module(IPMI REQUIRED libfreeipmi) +pkg_search_module(FANDEVICE REQUIRED fandevice) pkg_search_module(SYSTEMD systemd) -add_executable(${PROJECT_NAME} main.cpp ipmi.cpp lm.cpp) -target_link_libraries(${PROJECT_NAME} ${IPMI_LINK_LIBRARIES} ${IPMI_MONITORING_LINK_LIBRARIES} sensors) -target_include_directories(${PROJECT_NAME} PRIVATE ${IPMI_INCLUDE_DIRS} ${IPMI_MONITORING_INCLUDE_DIRS}) +add_executable(${PROJECT_NAME} main.cpp ipmi.cpp lm.cpp ipmifan.cpp fandevicefan.cpp fanzone.cpp) +target_link_libraries(${PROJECT_NAME} ${IPMI_LINK_LIBRARIES} ${FANDEVICE_LINK_LIBRARIES} ${IPMI_MONITORING_LINK_LIBRARIES} sensors) +target_include_directories(${PROJECT_NAME} PRIVATE ${IPMI_INCLUDE_DIRS} ${FANDEVICE_INCLUDE_DIRS} ${IPMI_MONITORING_INCLUDE_DIRS}) target_compile_options(${PROJECT_NAME} PRIVATE "-Wall" "-O2" "-g" "-fno-strict-aliasing" "-Wfatal-errors" "-Wno-reorder") install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin) diff --git a/main.cpp b/main.cpp index 9b2b01a..9faf0be 100644 --- a/main.cpp +++ b/main.cpp @@ -10,10 +10,16 @@ #include #include #include +#include #include +#include #include "ipmi.h" #include "lm.h" +#include "fan.h" +#include "ipmifan.h" +#include "fandevicefan.h" +#include "fanzone.h" sig_atomic_t running = true; @@ -61,87 +67,36 @@ std::vector gather_sensors(std::vector& ipmi_sensors, ipmi_monit return out; } -double fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature) +static double fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature, bool stop) { + if(stop && temperature & sensors) +static double mi100_fan_curve(double temperature, double min_fan, double max_fan, double low_temperature, double high_temperature, + double push_down_low_temperature, double push_down_high_temperature, bool &push_down_state) { - std::vector> gpus = {{"amdgpu-pci-0300", false}, {"amdgpu-pci-8300", false}, {"amdgpu-pci-8900", false}}; - const char monitored_sensor_name[] = "edge"; + double speed = fan_curve(temperature, min_fan, max_fan, low_temperature, high_temperature, false); + if(push_down_state) + speed = std::max(speed, 0.6); + if(temperature < push_down_low_temperature) + push_down_state = false; + else if(temperature > push_down_high_temperature) + push_down_state = true; + if(temperature > high_temperature) + return std::min((1-max_fan)*((temperature-high_temperature)/5.0)+max_fan, 1.0); - double max_temp = std::numeric_limits::min(); - for(const Sensor& sensor : sensors) - { - if(sensor.name == monitored_sensor_name) - { - for(std::pair& gpu : gpus) - { - if(sensor.chip == gpu.first) - { - if(max_temp < sensor.reading) - max_temp = sensor.reading; - gpu.second = true; - } - } - } - } - for(std::pair& gpu : gpus) - { - if(!gpu.second) - { - std::cerr<<"Could not get temperature from "<& sensors) +void ipmi_cleanup(ipmi_ctx_t raw_ctx) { - Sensor cpu("IPMI", "CPU Temp"); - Sensor system("IPMI", "System Temp"); - bool hitCpu = false; - bool hitSystem = false; - std::vector out; - - for(const Sensor& sensor : sensors) - { - if(cpu == sensor) - { - hitCpu = true; - cpu = sensor; - } - else if(sensor == system) - { - hitSystem = true; - system = sensor; - } - } - - if(hitCpu && hitSystem) - { - double fanSystem = fan_curve(system.reading, 0.33, 1.0, 40, 65); - double fanCpu = fan_curve(cpu.reading, 0.33, 1.0, 40, 70); - - return std::max(fanSystem, fanCpu); - } - else - { - std::cerr<<"Could not get temperature from System or Cpu! Ramping fans to maximum\n"; - return 1; - } -} - -std::vector get_fan_zones(const std::vector& sensors) -{ - std::vector out; - out.push_back(system_fan_zone(sensors)); - out.push_back(gpu_fan_zone(sensors)); - return out; + ipmi_set_fan_group(raw_ctx, 0, 1); + ipmi_set_fan_group(raw_ctx, 1, 1); + ipmi_ctx_close(raw_ctx); + ipmi_ctx_destroy(raw_ctx); } int main_loop() @@ -149,7 +104,7 @@ int main_loop() ipmi_ctx_t raw_ctx = ipmi_open_context(); if(!raw_ctx) { - sensors_cleanup(); + std::cerr<<"Unable to connect to impi\n"; return 1; } @@ -157,10 +112,7 @@ int main_loop() if(ret < 0) { std::cerr<<"Could not init lm_sensors\n"; - ipmi_set_fan_group(raw_ctx, 0, 1); - ipmi_set_fan_group(raw_ctx, 1, 1); - ipmi_ctx_close(raw_ctx); - ipmi_ctx_destroy(raw_ctx); + ipmi_cleanup(raw_ctx); return 1; } @@ -169,13 +121,14 @@ int main_loop() ipmi_sensors.push_back(Sensor("IPMI", "CPU Temp")); ipmi_sensors.push_back(Sensor("IPMI", "System Temp")); + std::vector lmSensors; + lmSensors.push_back(Sensor("amdgpu-pci-0300", "edge")); + lmSensors.push_back(Sensor("amdgpu-pci-8300", "edge")); + if(lm_chips.size() < 2) { - std::cerr<<"Could not get both monitored gpus!"; - ipmi_set_fan_group(raw_ctx, 0, 1); - ipmi_set_fan_group(raw_ctx, 1, 1); - ipmi_ctx_close(raw_ctx); - ipmi_ctx_destroy(raw_ctx); + std::cerr<<"Could not get enough monitored gpus!\n"; + ipmi_cleanup(raw_ctx); sensors_cleanup(); return 1; } @@ -183,37 +136,64 @@ int main_loop() ipmi_monitoring_ctx_t monitoring_ctx = init_ipmi_monitoring(); if(!monitoring_ctx) { - ipmi_set_fan_group(raw_ctx, 0, 1); - ipmi_set_fan_group(raw_ctx, 1, 1); - ipmi_ctx_close(raw_ctx); - ipmi_ctx_destroy(raw_ctx); + std::cerr<<"Unable to connect to impi for monitoring\n"; + ipmi_cleanup(raw_ctx); sensors_cleanup(); return 1; } + struct fandevice fdevice; + ret = fandevice_connect(&fdevice, 0); + if(ret < 0) + { + std::cerr<<"Unable to connect to FanDevice\n"; + ipmi_cleanup(raw_ctx); + sensors_cleanup(); + return 1; + } + std::vector fans; + fans.push_back(new IpmiFan(raw_ctx, 0, "IPMI CPU FAN")); + fans.push_back(new IpmiFan(raw_ctx, 1, "IPMI SYSTEM FAN")); + fans.push_back(new FanDeviceFan(&fdevice, FAN_A, "MI100_1 FAN")); + fans.push_back(new FanDeviceFan(&fdevice, FAN_B, "MI100_2 FAN")); + fans.push_back(new FanDeviceFan(&fdevice, FAN_D, "TOP SYSTEM FAN")); + fans.push_back(new FanDeviceFan(&fdevice, FAN_C, "FRONT SYSTEM FAN")); + + std::array pushDownStates = {true, true}; + + std::vector fanZones; + fanZones.push_back(new FanZone(ipmi_sensors[0], fans[0], [](double in){return fan_curve(in, 0.1, 1, 45, 65, false);}, "CPU FAN ZONE")); + fanZones.push_back(new FanZone({ipmi_sensors[0], ipmi_sensors[1]}, fans[1], [](double in){return fan_curve(in, 0.2, 1, 40, 55, false);}, "SYSTEM FAN ZONE")); + fanZones.push_back(new FanZone({ipmi_sensors[0], ipmi_sensors[1]}, fans[4], [](double in){return fan_curve(in, 0.5, 1, 60, 65, true);}, "TOP FAN ZONE")); + fanZones.push_back(new FanZone({lmSensors[0], lmSensors[1]}, fans[5], [](double in){return fan_curve(in, 0, 1, 60, 80, true);}, "FRONT FAN ZONE")); + fanZones.push_back(new FanZone(lmSensors[1], fans[2], [&pushDownStates](double in){return mi100_fan_curve(in, 0.14, 0.7, 65, 80, 50, 70, pushDownStates[0]);}, "MI100_1 FAN ZONE")); + fanZones.push_back(new FanZone(lmSensors[0], fans[3], [&pushDownStates](double in){return mi100_fan_curve(in, 0.14, 0.7, 65, 80, 50, 70, pushDownStates[1]);}, "MI100_2 FAN ZONE")); while(running) { std::vector sensors = gather_sensors(ipmi_sensors, monitoring_ctx, lm_chips); - std::vector fanzones = get_fan_zones(sensors); if(!quiet) { for(const Sensor& sensor : sensors) - std::cout<<"Sensor "<print(sensors); } - ipmi_set_fan_group(raw_ctx, 0, fanzones[0]); - ipmi_set_fan_group(raw_ctx, 1, fanzones[1]); + for(FanZone* zone : fanZones) + zone->step(sensors); + std::cout<<'\n'; sleep(10); } - ipmi_set_fan_group(raw_ctx, 0, 1); - ipmi_set_fan_group(raw_ctx, 1, 1); - ipmi_ctx_close(raw_ctx); - ipmi_ctx_destroy(raw_ctx); + for(FanZone* zone : fanZones) + delete zone; + for(Fan* fan : fans) + delete fan; + + ipmi_cleanup(raw_ctx); ipmi_monitoring_ctx_destroy(monitoring_ctx); sensors_cleanup(); diff --git a/sensor.h b/sensor.h index 4884c97..849eeb6 100644 --- a/sensor.h +++ b/sensor.h @@ -13,5 +13,5 @@ public: public: Sensor() = default; Sensor(std::string chipI, std::string nameI, int idI = 0): name(nameI), chip(chipI), id(idI) {} - bool operator==(const Sensor& other) {return other.name == name && other.chip == chip;} + bool operator==(const Sensor& other) const {return other.name == name && other.chip == chip;} };