root@server ~/gpu_fryer# enroot start gpu_fryer
Detected GPU #0: "NVIDIA H100 80GB HBM3"
Detected GPU #1: "NVIDIA H100 80GB HBM3"
Detected GPU #2: "NVIDIA H100 80GB HBM3"
Detected GPU #3: "NVIDIA H100 80GB HBM3"
Detected GPU #4: "NVIDIA H100 80GB HBM3"
Detected GPU #5: "NVIDIA H100 80GB HBM3"
Detected GPU #6: "NVIDIA H100 80GB HBM3"
Detected GPU #7: "NVIDIA H100 80GB HBM3"
Using precision(s): BF16
Creating random matrices
Matrices created
GPU #0: Using 72504 MB
GPU #1: Using 72504 MB
GPU #2: Using 72504 MB
GPU #3: Using 72504 MB
GPU #4: Using 72504 MB
GPU #5: Using 72504 MB
GPU #6: Using 72504 MB
GPU #7: Using 72504 MB
761 (836728 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) | Temperatures: 46°C - 32°C - 32°C - 29°C - 30°C - 31°C - 32°C - 31°C | Throttling: None - None - None - None - None - None - None - None
693 (761961 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) | Temperatures: 49°C - 32°C - 32°C - 29°C - 30°C - 31°C - 32°C - 31°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) | Temperatures: 51°C - 32°C - 32°C - 29°C - 30°C - 31°C - 32°C - 31°C | Throttling: None - None - None - None - None - None - None - None
694 (763061 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 0 (0 Gflops/s) - 785 (863116 Gflops/s) - 0 (0 Gflops/s) | Temperatures: 52°C - 32°C - 32°C - 29°C - 30°C - 31°C - 45°C - 31°C | Throttling: None - None - None - None - None - None - None - None
695 (764160 Gflops/s) - 1161 (1276532 Gflops/s) - 894 (982963 Gflops/s) - 1072 (1178676 Gflops/s) - 931 (1023645 Gflops/s) - 0 (0 Gflops/s) - 701 (770757 Gflops/s) - 1097 (1206164 Gflops/s) | Temperatures: 53°C - 47°C - 49°C - 48°C - 45°C - 31°C - 49°C - 50°C | Throttling: None - None - None - None - None - None - None - None
694 (763061 Gflops/s) - 700 (769658 Gflops/s) - 696 (765260 Gflops/s) - 678 (745468 Gflops/s) - 689 (757563 Gflops/s) - 753 (827932 Gflops/s) - 699 (768558 Gflops/s) - 686 (754264 Gflops/s) | Temperatures: 53°C - 51°C - 52°C - 50°C - 46°C - 46°C - 50°C - 51°C | Throttling: None - None - None - None - None - None - None - None
693 (761961 Gflops/s) - 697 (766359 Gflops/s) - 691 (759762 Gflops/s) - 680 (747667 Gflops/s) - 685 (753165 Gflops/s) - 692 (760862 Gflops/s) - 697 (766359 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 53°C - 52°C - 53°C - 51°C - 47°C - 49°C - 50°C - 52°C | Throttling: None - None - None - None - None - None - None - None
695 (764160 Gflops/s) - 697 (766359 Gflops/s) - 692 (760862 Gflops/s) - 680 (747667 Gflops/s) - 688 (756463 Gflops/s) - 691 (759762 Gflops/s) - 697 (766359 Gflops/s) - 686 (754264 Gflops/s) | Temperatures: 54°C - 54°C - 54°C - 51°C - 47°C - 51°C - 51°C - 53°C | Throttling: None - None - None - None - None - None - None - None
693 (761961 Gflops/s) - 698 (767459 Gflops/s) - 688 (756463 Gflops/s) - 684 (752065 Gflops/s) - 688 (756463 Gflops/s) - 690 (758663 Gflops/s) - 698 (767459 Gflops/s) - 687 (755364 Gflops/s) | Temperatures: 54°C - 55°C - 55°C - 52°C - 48°C - 52°C - 52°C - 54°C | Throttling: None - None - None - None - None - None - None - None
693 (761961 Gflops/s) - 697 (766359 Gflops/s) - 691 (759762 Gflops/s) - 682 (749866 Gflops/s) - 688 (756463 Gflops/s) - 690 (758663 Gflops/s) - 696 (765260 Gflops/s) - 687 (755364 Gflops/s) | Temperatures: 55°C - 55°C - 56°C - 52°C - 49°C - 53°C - 52°C - 55°C | Throttling: None - None - None - None - None - None - None - None
694 (763061 Gflops/s) - 696 (765260 Gflops/s) - 690 (758663 Gflops/s) - 684 (752065 Gflops/s) - 686 (754264 Gflops/s) - 691 (759762 Gflops/s) - 697 (766359 Gflops/s) - 687 (755364 Gflops/s) | Temperatures: 55°C - 56°C - 57°C - 52°C - 49°C - 54°C - 53°C - 56°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 696 (765260 Gflops/s) - 690 (758663 Gflops/s) - 685 (753165 Gflops/s) - 687 (755364 Gflops/s) - 689 (757563 Gflops/s) - 695 (764160 Gflops/s) - 686 (754264 Gflops/s) | Temperatures: 56°C - 57°C - 57°C - 53°C - 50°C - 55°C - 53°C - 57°C | Throttling: None - None - None - None - None - None - None - None
693 (761961 Gflops/s) - 697 (766359 Gflops/s) - 688 (756463 Gflops/s) - 684 (752065 Gflops/s) - 686 (754264 Gflops/s) - 691 (759762 Gflops/s) - 697 (766359 Gflops/s) - 686 (754264 Gflops/s) | Temperatures: 56°C - 58°C - 58°C - 53°C - 50°C - 56°C - 54°C - 57°C | Throttling: None - None - None - None - None - None - None - None
690 (758663 Gflops/s) - 694 (763061 Gflops/s) - 689 (757563 Gflops/s) - 685 (753165 Gflops/s) - 691 (759762 Gflops/s) - 690 (758663 Gflops/s) - 697 (766359 Gflops/s) - 686 (754264 Gflops/s) | Temperatures: 56°C - 58°C - 59°C - 54°C - 50°C - 56°C - 55°C - 57°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 694 (763061 Gflops/s) - 690 (758663 Gflops/s) - 685 (753165 Gflops/s) - 688 (756463 Gflops/s) - 688 (756463 Gflops/s) - 695 (764160 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 56°C - 59°C - 59°C - 54°C - 51°C - 57°C - 55°C - 57°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 695 (764160 Gflops/s) - 689 (757563 Gflops/s) - 686 (754264 Gflops/s) - 689 (757563 Gflops/s) - 687 (755364 Gflops/s) - 696 (765260 Gflops/s) - 686 (754264 Gflops/s) | Temperatures: 57°C - 59°C - 60°C - 54°C - 51°C - 57°C - 55°C - 58°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 695 (764160 Gflops/s) - 689 (757563 Gflops/s) - 685 (753165 Gflops/s) - 688 (756463 Gflops/s) - 688 (756463 Gflops/s) - 696 (765260 Gflops/s) - 685 (753165 Gflops/s) | Temperatures: 57°C - 60°C - 60°C - 55°C - 51°C - 58°C - 56°C - 58°C | Throttling: None - None - None - None - None - None - None - None
690 (758663 Gflops/s) - 695 (764160 Gflops/s) - 688 (756463 Gflops/s) - 683 (750966 Gflops/s) - 687 (755364 Gflops/s) - 687 (755364 Gflops/s) - 693 (761961 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 57°C - 60°C - 61°C - 55°C - 52°C - 58°C - 56°C - 58°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 693 (761961 Gflops/s) - 687 (755364 Gflops/s) - 685 (753165 Gflops/s) - 688 (756463 Gflops/s) - 686 (754264 Gflops/s) - 696 (765260 Gflops/s) - 685 (753165 Gflops/s) | Temperatures: 58°C - 60°C - 61°C - 55°C - 51°C - 58°C - 56°C - 59°C | Throttling: None - None - None - None - None - None - None - None
694 (763061 Gflops/s) - 694 (763061 Gflops/s) - 685 (753165 Gflops/s) - 685 (753165 Gflops/s) - 687 (755364 Gflops/s) - 687 (755364 Gflops/s) - 694 (763061 Gflops/s) - 685 (753165 Gflops/s) | Temperatures: 58°C - 60°C - 60°C - 55°C - 52°C - 59°C - 57°C - 59°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 693 (761961 Gflops/s) - 687 (755364 Gflops/s) - 685 (753165 Gflops/s) - 686 (754264 Gflops/s) - 685 (753165 Gflops/s) - 693 (761961 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 58°C - 61°C - 61°C - 55°C - 52°C - 59°C - 57°C - 59°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 695 (764160 Gflops/s) - 687 (755364 Gflops/s) - 685 (753165 Gflops/s) - 686 (754264 Gflops/s) - 687 (755364 Gflops/s) - 694 (763061 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 58°C - 61°C - 62°C - 56°C - 53°C - 60°C - 57°C - 59°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 694 (763061 Gflops/s) - 687 (755364 Gflops/s) - 685 (753165 Gflops/s) - 686 (754264 Gflops/s) - 686 (754264 Gflops/s) - 695 (764160 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 58°C - 62°C - 62°C - 56°C - 53°C - 60°C - 58°C - 59°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 692 (760862 Gflops/s) - 687 (755364 Gflops/s) - 683 (750966 Gflops/s) - 687 (755364 Gflops/s) - 684 (752065 Gflops/s) - 692 (760862 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 58°C - 62°C - 62°C - 56°C - 53°C - 60°C - 58°C - 59°C | Throttling: None - None - None - None - None - None - None - None
693 (761961 Gflops/s) - 694 (763061 Gflops/s) - 688 (756463 Gflops/s) - 685 (753165 Gflops/s) - 687 (755364 Gflops/s) - 686 (754264 Gflops/s) - 693 (761961 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 59°C - 62°C - 62°C - 56°C - 53°C - 61°C - 58°C - 60°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 693 (761961 Gflops/s) - 689 (757563 Gflops/s) - 685 (753165 Gflops/s) - 688 (756463 Gflops/s) - 685 (753165 Gflops/s) - 694 (763061 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 59°C - 62°C - 63°C - 56°C - 53°C - 61°C - 59°C - 60°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 693 (761961 Gflops/s) - 690 (758663 Gflops/s) - 685 (753165 Gflops/s) - 685 (753165 Gflops/s) - 685 (753165 Gflops/s) - 694 (763061 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 59°C - 63°C - 63°C - 56°C - 54°C - 61°C - 59°C - 60°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 697 (766359 Gflops/s) - 691 (759762 Gflops/s) - 685 (753165 Gflops/s) - 687 (755364 Gflops/s) - 685 (753165 Gflops/s) - 696 (765260 Gflops/s) - 684 (752065 Gflops/s) | Temperatures: 59°C - 63°C - 64°C - 56°C - 54°C - 61°C - 59°C - 60°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 699 (768558 Gflops/s) - 689 (757563 Gflops/s) - 686 (754264 Gflops/s) - 687 (755364 Gflops/s) - 685 (753165 Gflops/s) - 695 (764160 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 59°C - 63°C - 64°C - 57°C - 54°C - 62°C - 60°C - 61°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 700 (769658 Gflops/s) - 692 (760862 Gflops/s) - 685 (753165 Gflops/s) - 687 (755364 Gflops/s) - 686 (754264 Gflops/s) - 694 (763061 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 59°C - 63°C - 64°C - 57°C - 54°C - 62°C - 60°C - 61°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 698 (767459 Gflops/s) - 693 (761961 Gflops/s) - 682 (749866 Gflops/s) - 686 (754264 Gflops/s) - 685 (753165 Gflops/s) - 694 (763061 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 59°C - 63°C - 64°C - 57°C - 54°C - 62°C - 60°C - 61°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 699 (768558 Gflops/s) - 694 (763061 Gflops/s) - 683 (750966 Gflops/s) - 687 (755364 Gflops/s) - 686 (754264 Gflops/s) - 696 (765260 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 59°C - 63°C - 64°C - 57°C - 55°C - 63°C - 60°C - 61°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 700 (769658 Gflops/s) - 694 (763061 Gflops/s) - 683 (750966 Gflops/s) - 686 (754264 Gflops/s) - 689 (757563 Gflops/s) - 696 (765260 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 59°C - 64°C - 65°C - 57°C - 55°C - 63°C - 60°C - 61°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 699 (768558 Gflops/s) - 693 (761961 Gflops/s) - 684 (752065 Gflops/s) - 687 (755364 Gflops/s) - 689 (757563 Gflops/s) - 696 (765260 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 60°C - 64°C - 65°C - 58°C - 54°C - 63°C - 60°C - 61°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 698 (767459 Gflops/s) - 693 (761961 Gflops/s) - 684 (752065 Gflops/s) - 685 (753165 Gflops/s) - 689 (757563 Gflops/s) - 694 (763061 Gflops/s) - 682 (749866 Gflops/s) | Temperatures: 60°C - 64°C - 65°C - 58°C - 55°C - 63°C - 61°C - 61°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 698 (767459 Gflops/s) - 694 (763061 Gflops/s) - 682 (749866 Gflops/s) - 685 (753165 Gflops/s) - 689 (757563 Gflops/s) - 695 (764160 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 60°C - 64°C - 65°C - 58°C - 55°C - 63°C - 61°C - 61°C | Throttling: None - None - None - None - None - None - None - None
692 (760862 Gflops/s) - 699 (768558 Gflops/s) - 693 (761961 Gflops/s) - 683 (750966 Gflops/s) - 687 (755364 Gflops/s) - 690 (758663 Gflops/s) - 695 (764160 Gflops/s) - 683 (750966 Gflops/s) | Temperatures: 60°C - 64°C - 65°C - 58°C - 55°C - 63°C - 61°C - 61°C | Throttling: None - None - None - None - None - None - None - None
691 (759762 Gflops/s) - 699 (768558 Gflops/s) - 694 (763061 Gflops/s) - 683 (750966 Gflops/s) - 686 (754264 Gflops/s) - 689 (757563 Gflops/s) - 694 (763061 Gflops/s) - 682 (749866 Gflops/s) | Temperatures: 60°C - 65°C - 65°C - 58°C - 55°C - 64°C - 61°C - 62°C | Throttling: None - None - None - None - None - None - None - None
Killed
There is no error stack trace.
Please advise.
| Temperatures: 52°C - 57°C - 57°C - 49°C - 48°C - 56°C - 55°C - 53°C | Throttling: None - None - None - None - None - None - None - None
1228 (1350200 Gflops/s) - 1237 (1360095 Gflops/s) - 1224 (1345802 Gflops/s) - 1234 (1356797 Gflops/s) - 1234 (1356797 Gflops/s) - 1219 (1340304 Gflops/s) - 1237 (1360095 Gflops/s) - 1234 (1356797 Gflops/s) | Temperatures: 52°C - 57°C - 57°C - 49°C - 48°C - 56°C - 56°C - 53°C | Throttling: None - None - None - None - None - None - None - None
GPU #0: 1355084 Gflops/s (min: 1348001.26, max: 1358996.37, dev: 1355084.16)
Temperature: 51.07°C (min: 48.00, max: 52.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
GPU #1: 1360352 Gflops/s (min: 1357896.86, max: 1368891.98, dev: 1360351.58)
Temperature: 54.37°C (min: 50.00, max: 57.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
GPU #2: 1347618 Gflops/s (min: 1344702.72, max: 1354598.33, dev: 1347617.71)
Temperature: 54.72°C (min: 50.00, max: 57.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
GPU #3: 1358996 Gflops/s (min: 1355697.84, max: 1365593.44, dev: 1358996.37)
Temperature: 47.37°C (min: 42.00, max: 49.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
GPU #4: 1358076 Gflops/s (min: 1346901.74, max: 1362294.91, dev: 1358075.85)
Temperature: 46.35°C (min: 42.00, max: 48.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
GPU #5: 1346262 Gflops/s (min: 1340304.67, max: 1355697.84, dev: 1346262.49)
Temperature: 53.56°C (min: 47.00, max: 56.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
GPU #6: 1368304 Gflops/s (min: 1355697.84, max: 1645968.91, dev: 1368303.87)
Temperature: 52.49°C (min: 46.00, max: 56.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
GPU #7: 1362525 Gflops/s (min: 1348001.26, max: 1621779.65, dev: 1362525.04)
Temperature: 51.00°C (min: 45.00, max: 53.00)
Throttling HW: false, Thermal SW: false, Thermal HW: false
All GPUs seem healthy
Freeing GPUs...
Hi Team
While trying to run gpu fryer using enroot, the run abruptly stops with a "Killed" message.
OS: Ubuntu 22.04.5 LTS
Kernel: 5.15.0-1063-nvidia
Here are the logs:
There is no error stack trace.
Please advise.
UPDATE:
I reattempted using:
enroot start gpu_fryer --use-fp8
and I see: