Skip to content

Commit a8b627a

Browse files
poettering authored and keszybz committed
main: bump fs.nr_open + fs.file-max to their largest possible values
After discussions with kernel folks, a system with memcg really shouldn't need extra hard limits on file descriptors anymore, as they are properly accounted for by memcg anyway. Hence, let's bump these values to their maximums. This also adds a build time option to turn this off, to cover those users who do not want to use memcg.
1 parent 52d363e commit a8b627a

File tree

4 files changed

+102
-0
lines changed

4 files changed

+102
-0
lines changed

NEWS

+11
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,17 @@ CHANGES WITH 240 in spe:
5252
anymore (and neither can any shared library they use — or any shared
5353
library used by any shared library they use and so on).
5454

55+
* The fs.nr_open and fs.file-max sysctls are now automatically bumped
56+
to the highest possible values, as separate accounting of file
57+
descriptors is no longer necessary, as memcg tracks them correctly as
58+
part of the memory accounting anyway. Thus, from the four limits on
59+
file descriptors currently enforced (fs.file-max, fs.nr_open,
60+
RLIMIT_NOFILE hard, RLIMIT_NOFILE soft) we turn off the first two,
61+
and keep only the latter two. A set of build-time options
62+
(-Dbump-proc-sys-fs-file-max=no and -Dbump-proc-sys-fs-nr-open=no)
63+
has been added to revert this change in behaviour, which might be
64+
an option for systems that turn off memcg in the kernel.
65+
5566
CHANGES WITH 239:
5667

5768
* NETWORK INTERFACE DEVICE NAMING CHANGES: systemd-udevd's "net_id"

meson.build

+3
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,9 @@ sysvrcnd_path = get_option('sysvrcnd-path')
7373
conf.set10('HAVE_SYSV_COMPAT', sysvinit_path != '' and sysvrcnd_path != '',
7474
description : 'SysV init scripts and rcN.d links are supported')
7575

76+
conf.set10('BUMP_PROC_SYS_FS_FILE_MAX', get_option('bump-proc-sys-fs-file-max'))
77+
conf.set10('BUMP_PROC_SYS_FS_NR_OPEN', get_option('bump-proc-sys-fs-nr-open'))
78+
7679
# join_paths ignore the preceding arguments if an absolute component is
7780
# encountered, so this should canonicalize various paths when they are
7881
# absolute or relative.

meson_options.txt

+4
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ option('debug-extra', type : 'array', choices : ['hashmap', 'mmap-cache'], value
4949
description : 'enable extra debugging')
5050
option('memory-accounting-default', type : 'boolean',
5151
description : 'enable MemoryAccounting= by default')
52+
option('bump-proc-sys-fs-file-max', type : 'boolean',
53+
description : 'bump /proc/sys/fs/file-max to ULONG_MAX')
54+
option('bump-proc-sys-fs-nr-open', type : 'boolean',
55+
description : 'bump /proc/sys/fs/nr_open to INT_MAX')
5256
option('valgrind', type : 'boolean', value : false,
5357
description : 'do extra operations to avoid valgrind warnings')
5458
option('log-trace', type : 'boolean', value : false,

src/core/main.c

+84
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@
7373
#include "stdio-util.h"
7474
#include "strv.h"
7575
#include "switch-root.h"
76+
#include "sysctl-util.h"
7677
#include "terminal-util.h"
7778
#include "umask-util.h"
7879
#include "user-util.h"
@@ -1162,6 +1163,88 @@ static int prepare_reexecute(
11621163
return 0;
11631164
}
11641165

1166+
static void bump_file_max_and_nr_open(void) {
1167+
1168+
/* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large numbers of file
1169+
* descriptors are no longer a performance problem and their memory is properly tracked by memcg, thus counting
1170+
* them and limiting them in another two layers of limits is unnecessary and just complicates things. This
1171+
* function hence turns off 2 of the 4 levels of limits on file descriptors, and makes RLIMIT_NOFILE (soft +
1172+
* hard) the only ones that really matter. */
1173+
1174+
#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
1175+
_cleanup_free_ char *t = NULL;
1176+
int r;
1177+
#endif
1178+
1179+
#if BUMP_PROC_SYS_FS_FILE_MAX
1180+
/* I so wanted to use STRINGIFY(ULONG_MAX) here, but alas we can't as glibc/gcc define that as
1181+
* "(0x7fffffffffffffffL * 2UL + 1UL)". Seriously. 😢 */
1182+
if (asprintf(&t, "%lu\n", ULONG_MAX) < 0) {
1183+
log_oom();
1184+
return;
1185+
}
1186+
1187+
r = sysctl_write("fs/file-max", t);
1188+
if (r < 0)
1189+
log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m");
1190+
#endif
1191+
1192+
#if BUMP_PROC_SYS_FS_FILE_MAX && BUMP_PROC_SYS_FS_NR_OPEN
1193+
t = mfree(t);
1194+
#endif
1195+
1196+
#if BUMP_PROC_SYS_FS_NR_OPEN
1197+
int v = INT_MAX;
1198+
1199+
/* Arg! The kernel enforces maximum and minimum values on the fs.nr_open, but we don't really know what they
1200+
* are. The expression by which the maximum is determined is dependent on the architecture, and is something we
1201+
* don't really want to copy to userspace, as it is dependent on implementation details of the kernel. Since
1202+
* the kernel doesn't expose the maximum value to us, we can only try and hope. Hence, let's start with
1203+
* INT_MAX, and then keep halving the value until we find one that works. Ugly? Yes, absolutely, but kernel
1204+
* APIs are kernel APIs, so what can we do... 🤯 */
1205+
1206+
for (;;) {
1207+
int k;
1208+
1209+
v &= ~(__SIZEOF_POINTER__ - 1); /* Round down to next multiple of the pointer size */
1210+
if (v < 1024) {
1211+
log_warning("Can't bump fs.nr_open, value too small.");
1212+
break;
1213+
}
1214+
1215+
k = read_nr_open();
1216+
if (k < 0) {
1217+
log_error_errno(k, "Failed to read fs.nr_open: %m");
1218+
break;
1219+
}
1220+
if (k >= v) { /* Already larger */
1221+
log_debug("Skipping bump, value is already larger.");
1222+
break;
1223+
}
1224+
1225+
if (asprintf(&t, "%i\n", v) < 0) {
1226+
log_oom();
1227+
return;
1228+
}
1229+
1230+
r = sysctl_write("fs/nr_open", t);
1231+
t = mfree(t);
1232+
if (r == -EINVAL) {
1233+
log_debug("Couldn't write fs.nr_open as %i, halving it.", v);
1234+
v /= 2;
1235+
continue;
1236+
}
1237+
if (r < 0) {
1238+
log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");
1239+
break;
1240+
}
1241+
1242+
log_debug("Successfully bumped fs.nr_open to %i", v);
1243+
break;
1244+
}
1245+
#endif
1246+
}
1247+
11651248
static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
11661249
int r, nr;
11671250

@@ -1883,6 +1966,7 @@ static int initialize_runtime(
18831966
machine_id_setup(NULL, arg_machine_id, NULL);
18841967
loopback_setup();
18851968
bump_unix_max_dgram_qlen();
1969+
bump_file_max_and_nr_open();
18861970
test_usr();
18871971
write_container_id();
18881972
}

0 commit comments

Comments
 (0)