diff --git a/fw/cfg.c b/fw/cfg.c
index 1a73dab42..a15d3515f 100644
--- a/fw/cfg.c
+++ b/fw/cfg.c
@@ -1826,27 +1826,38 @@ tfw_cfg_set_str(TfwCfgSpec *cs, TfwCfgEntry *e)
 }
 
 int
-tfw_cfg_set_mem(TfwCfgSpec *cs, TfwCfgEntry *e)
+tfw_cfg_set_mem_val(TfwCfgSpec *cs, TfwCfgEntry *e,
+		    unsigned int val_num, unsigned long *dst)
 {
-	long *dest_long;
-	unsigned long val;
+	unsigned int n = val_num;
 	TfwCfgSpecMem *cse;
 
-	BUG_ON(!cs->dest);
-
-	if (tfw_cfg_check_single_val(e))
-		return -EINVAL;
-
 	/* Check value restrictions if we have any in the spec extension. */
 	cse = cs->spec_ext;
 	if (cse) {
-		if (tfw_cfg_mem_check_multiple_of(e->vals[0], cse->multiple_of)
-		    || tfw_cfg_mem_check_range(e->vals[0], cse->range.min,
+		if (tfw_cfg_mem_check_multiple_of(e->vals[n], cse->multiple_of)
+		    || tfw_cfg_mem_check_range(e->vals[n], cse->range.min,
 					       cse->range.max))
 			return -EINVAL;
 	}
 
-	val = memparse(e->vals[0], NULL);
+	*dst = memparse(e->vals[n], NULL);
+
+	return 0;
+}
+
+int
+tfw_cfg_set_mem(TfwCfgSpec *cs, TfwCfgEntry *e)
+{
+	long *dest_long;
+	unsigned long val;
+	int r;
+
+	BUG_ON(!cs->dest);
+
+	r = tfw_cfg_set_mem_val(cs, e, 0, &val);
+	if (r)
+		return r;
 
 	dest_long = cs->dest;
 	*dest_long = val;
diff --git a/fw/cfg.h b/fw/cfg.h
index 8f1682320..722899b62 100644
--- a/fw/cfg.h
+++ b/fw/cfg.h
@@ -489,6 +489,8 @@ int tfw_cfg_set_int(TfwCfgSpec *spec, TfwCfgEntry *parsed_entry);
 int tfw_cfg_set_long(TfwCfgSpec *spec, TfwCfgEntry *parsed_entry);
 int tfw_cfg_set_str(TfwCfgSpec *spec, TfwCfgEntry *parsed_entry);
 int tfw_cfg_set_mem(TfwCfgSpec *spec, TfwCfgEntry *parsed_entry);
+int tfw_cfg_set_mem_val(TfwCfgSpec *spec, TfwCfgEntry *parsed_entry,
+			unsigned int val_num, unsigned long *dst);
 int tfw_cfg_handle_children(TfwCfgSpec *self, TfwCfgEntry *parsed_entry);
 void tfw_cfg_cleanup_children(TfwCfgSpec *cs);
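
For context: memparse() is the kernel helper (lib/cmdline.c) that both tfw_cfg_set_mem_val() and the removed tfw_parse_client_mem() rely on; it reads a number and scales it by an optional size suffix. Below is a minimal userspace sketch of that suffix handling. parse_mem_sketch is a hypothetical stand-in, not the kernel code, and only the K/M/G suffixes are shown (the kernel also accepts T/P/E):

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for the kernel's memparse(); K/M/G only. */
static unsigned long long
parse_mem_sketch(const char *ptr, char **retptr)
{
	char *end;
	unsigned long long ret = strtoull(ptr, &end, 0);

	switch (*end) {
	case 'G': case 'g':
		ret <<= 10;
		/* fall through */
	case 'M': case 'm':
		ret <<= 10;
		/* fall through */
	case 'K': case 'k':
		ret <<= 10;
		end++;
	default:
		break;
	}
	if (retptr)
		*retptr = end;
	return ret;
}

int
main(void)
{
	/* Prints "524288000 1073741824": "500M" and "1G" in bytes. */
	printf("%llu %llu\n", parse_mem_sketch("500M", NULL),
	       parse_mem_sketch("1G", NULL));
	return 0;
}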
diff --git a/fw/client.c b/fw/client.c
index e6b9eb9ce..56f28e0f5 100644
--- a/fw/client.c
+++ b/fw/client.c
@@ -71,18 +71,13 @@ static TDB *client_db;
 static atomic_t shutdown_pending = ATOMIC_INIT(0);
 static DECLARE_WAIT_QUEUE_HEAD(shutdown_wq);
 
-static struct kmem_cache *cli_mem_cache;
+static struct kmem_cache *tfw_cli_mem_cache;
 
 static struct {
 	TfwClientMem		*mem;
-	struct list_head	free_list;
+	TfwClientMem		*free_list;
 	unsigned int		size;
 	unsigned int		order;
-} cli_mem_pool = {
-	.mem = NULL,
-	.free_list = LIST_HEAD_INIT(cli_mem_pool.free_list),
-	.size = 0,
-	.order = 0,
-};
+} cli_mem_pool;
 
 static inline bool
 tfw_cli_mem_belongs_to_pool(TfwClientMem *cli_mem)
@@ -97,9 +92,13 @@ __cli_mem_release(TfwClientMem *cli_mem)
 	percpu_ref_exit(&cli_mem->refcnt);
 	free_percpu(cli_mem->mem);
 	if (!tfw_cli_mem_belongs_to_pool(cli_mem))
-		kmem_cache_free(cli_mem_cache, cli_mem);
+		kmem_cache_free(tfw_cli_mem_cache, cli_mem);
 }
 
+/*
+ * Reset counters, reinit refcnt and put `cli_mem` back to the pool.
+ * Should be called under `ga_lock` to protect `cli_mem_pool.free_list`.
+ */
 static inline void
 tfw_cli_mem_pool_free(TfwClientMem *cli_mem)
 {
@@ -110,9 +109,34 @@ tfw_cli_mem_pool_free(TfwClientMem *cli_mem)
 	for_each_online_cpu(cpu)
 		*per_cpu_ptr(cli_mem->mem, cpu) = 0;
 	percpu_ref_reinit(&cli_mem->refcnt);
-	list_add_tail(&cli_mem->in_free_list, &cli_mem_pool.free_list);
+	cli_mem->next_free = cli_mem_pool.free_list;
+	cli_mem_pool.free_list = cli_mem;
 }
 
+/*
+ * Workqueue handler for asynchronous cli_mem destruction.
+ *
+ * This function initiates final teardown of a TfwClientMem object:
+ * - percpu_ref_kill() marks the refcount as dead, preventing any new
+ *   users from acquiring references.
+ * - percpu_ref_put() drops the caller's reference, which may trigger
+ *   final release via cli_mem_release() once all outstanding users
+ *   are gone.
+ */
+static void
+tfw_cli_mem_kill_work_fn(struct work_struct *work)
+{
+	TfwClientMem *cli_mem = container_of(work, TfwClientMem, kill_work);
+
+	percpu_ref_kill(&cli_mem->refcnt);
+	percpu_ref_put(&cli_mem->refcnt);
+}
+
+/*
+ * Get a `TfwClientMem` object from the pool if one is present.
+ * The object was already initialized during pool creation or
+ * when it was released back to the pool.
+ */
 static inline TfwClientMem *
 tfw_cli_mem_pool_alloc(void)
 {
@@ -120,16 +144,25 @@ tfw_cli_mem_pool_alloc(void)
 
 	assert_spin_locked(&client_db->ga_lock);
 
-	cli_mem = list_first_entry_or_null(&cli_mem_pool.free_list,
-					   TfwClientMem, in_free_list);
-	if (!cli_mem)
+	if (!cli_mem_pool.free_list)
 		return NULL;
 
-	list_del_init(&cli_mem->in_free_list);
+	cli_mem = cli_mem_pool.free_list;
+	cli_mem_pool.free_list = cli_mem->next_free;
+	/*
+	 * Should be called only after the object is unlinked from
+	 * `free_list` via the `next_free` pointer, because the
+	 * `next_free` and `kill_work` members belong to the same union.
+	 */
+	INIT_WORK(&cli_mem->kill_work, tfw_cli_mem_kill_work_fn);
 
 	return cli_mem;
 }
 
+/*
+ * Final release of cli_mem: verify refcnt/memory are zero and either
+ * return it to the pool or free it. Signals shutdown completion if needed.
+ */
 static void
 cli_mem_release(struct percpu_ref *ref)
 {
@@ -138,6 +171,7 @@ cli_mem_release(struct percpu_ref *ref)
 	spin_lock_bh(&client_db->ga_lock);
 
 	WARN_ON_ONCE(!percpu_ref_is_zero(ref));
+	WARN_ON_ONCE(tfw_client_mem(cli_mem));
 	if (tfw_cli_mem_belongs_to_pool(cli_mem))
 		tfw_cli_mem_pool_free(cli_mem);
 	else
@@ -149,15 +183,6 @@ cli_mem_release(struct percpu_ref *ref)
 		wake_up(&shutdown_wq);
 }
 
-static void
-tfw_cli_mem_kill_work_fn(struct work_struct *work)
-{
-	TfwClientMem *cli_mem = container_of(work, TfwClientMem, kill_work);
-
-	percpu_ref_kill(&cli_mem->refcnt);
-	percpu_ref_put(&cli_mem->refcnt);
-}
-
 static inline int
 tfw_cli_mem_init(TfwClientMem *cli_mem, gfp_t flags)
 {
@@ -172,13 +197,11 @@ tfw_cli_mem_init(TfwClientMem *cli_mem, gfp_t flags)
 	if (unlikely(r))
 		goto free_per_cpu_mem;
 
-	INIT_LIST_HEAD(&cli_mem->in_free_list);
-	INIT_WORK(&cli_mem->kill_work, tfw_cli_mem_kill_work_fn);
-
 	return 0;
 
 free_per_cpu_mem:
 	free_percpu(cli_mem->mem);
+	cli_mem->mem = NULL;
 
 	return r;
 }
@@ -186,25 +209,39 @@ tfw_cli_mem_init(TfwClientMem *cli_mem, gfp_t flags)
 static inline void
 tfw_cli_mem_pool_exit(void)
 {
-	TfwClientMem *curr, *tmp;
+	TfwClientMem *tmp, *curr = cli_mem_pool.free_list;
 
-	list_for_each_entry_safe(curr, tmp, &cli_mem_pool.free_list,
-				 in_free_list)
-	{
-		list_del_init(&curr->in_free_list);
-		__cli_mem_release(curr);
+	while (curr) {
+		tmp = curr;
+		curr = tmp->next_free;
+		__cli_mem_release(tmp);
 	}
 
 	free_pages((unsigned long)cli_mem_pool.mem, cli_mem_pool.order);
-	cli_mem_pool.mem = NULL;
+	bzero_fast(&cli_mem_pool, sizeof(cli_mem_pool));
 }
 
+/*
+ * Initialize the cli_mem pool.
+ *
+ * Allocates a contiguous block of TfwClientMem objects and initializes
+ * each element, then builds a free list for fast allocation.
+ *
+ * Steps:
+ * - Validate pool size from configuration.
+ * - Compute allocation order and clamp it to MAX_PAGE_ORDER.
+ * - Allocate zeroed pages for the entire pool.
+ * - Initialize each TfwClientMem (per-cpu counters + refcnt + work).
+ * - Link all objects into a singly-linked free list.
+ *
+ * This provides fast `TfwClientMem` allocations later.
+ */
 static inline int
 tfw_cli_mem_pool_init(void)
 {
-	TfwClientMem *block;
-	unsigned int order;
-	int i, r;
+	TfwClientMem *block, *tail = NULL;
+	unsigned int i, order;
+	int r;
 
 	if (WARN_ON_ONCE(!client_cfg.lru_size))
 		return -EINVAL;
@@ -219,12 +256,31 @@ tfw_cli_mem_pool_init(void)
 	if (unlikely(!cli_mem_pool.mem))
 		return -ENOMEM;
 
+	/*
+	 * Initialize the pool in forward order and build free_list as
+	 * 0 -> 1 -> ... -> N-1.
+	 *
+	 * This preserves the natural memory layout of the preallocated
+	 * array, which is important because tfw_cli_mem_belongs_to_pool()
+	 * relies on the pool being a contiguous range [mem, mem + size).
+	 *
+	 * Using tail insertion avoids reversing the order (which would
+	 * happen with head insertion) and keeps allocation predictable
+	 * and cache-friendly.
+	 */
 	block = cli_mem_pool.mem;
 	for (i = 0; i < client_cfg.lru_size; i++) {
 		r = tfw_cli_mem_init(&block[i], GFP_KERNEL);
 		if (unlikely(r))
 			return r;
-		list_add(&block[i].in_free_list, &cli_mem_pool.free_list);
+
+		if (!cli_mem_pool.free_list)
+			cli_mem_pool.free_list = &block[i];
+		else
+			tail->next_free = &block[i];
+
+		block[i].next_free = NULL;
+		tail = &block[i];
 		cli_mem_pool.size++;
 	}
 
@@ -350,26 +406,36 @@ tfw_client_addr_eq(TdbRec *rec, void *data)
 	return true;
 }
 
+/*
+ * Allocate cli_mem from the slab cache and fully initialize it.
+ * Used as a fallback when the pool is exhausted.
+ */
 static inline TfwClientMem *
tfw_cli_mem_alloc_from_cache(void)
 {
 	TfwClientMem *cli_mem;
 
-	cli_mem = kmem_cache_alloc(cli_mem_cache, GFP_ATOMIC);
+	cli_mem = kmem_cache_alloc(tfw_cli_mem_cache, GFP_ATOMIC);
 	if (unlikely(!cli_mem))
 		return NULL;
 
 	if (unlikely(tfw_cli_mem_init(cli_mem, GFP_ATOMIC)))
 		goto free_cli_mem;
+	INIT_WORK(&cli_mem->kill_work, tfw_cli_mem_kill_work_fn);
 
 	return cli_mem;
 
 free_cli_mem:
-	kmem_cache_free(cli_mem_cache, cli_mem);
+	kmem_cache_free(tfw_cli_mem_cache, cli_mem);
 	return NULL;
 }
 
+/*
+ * Allocate cli_mem:
+ * - Try the fast pool first, then fall back to the slab cache.
+ * - On success, take an extra refcnt reference before returning.
+ */
 static inline TfwClientMem *
 tfw_cli_mem_alloc(void)
 {
@@ -589,10 +655,10 @@ TfwMod tfw_client_mod = {
 int __init
 tfw_client_init(void)
 {
-	cli_mem_cache = kmem_cache_create("cli_mem_cache",
-					  sizeof(TfwClientMem),
-					  0, 0, NULL);
-	if (!cli_mem_cache)
+	tfw_cli_mem_cache = kmem_cache_create("tfw_cli_mem_cache",
+					      sizeof(TfwClientMem),
+					      0, 0, NULL);
+	if (!tfw_cli_mem_cache)
 		return -ENOMEM;
 
 	tfw_mod_register(&tfw_client_mod);
@@ -602,6 +668,6 @@ tfw_client_init(void)
 void
 tfw_client_exit(void)
 {
-	kmem_cache_destroy(cli_mem_cache);
+	kmem_cache_destroy(tfw_cli_mem_cache);
 	tfw_mod_unregister(&tfw_client_mod);
 }
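
The layout change above relies on a classic trick: while an object sits on the free list its kill_work storage is dead, so the same bytes can carry the next_free link, which is also why INIT_WORK moved from tfw_cli_mem_init() into the allocation paths. A self-contained sketch of the idea, with illustrative names rather than the Tempesta FW ones:

#include <assert.h>
#include <stddef.h>

struct fake_work { void (*fn)(void *); };

struct obj {
	union {
		struct fake_work	kill_work;	/* valid while in use */
		struct obj		*next_free;	/* valid while pooled */
	};
	int payload;
};

static struct obj *free_list;

static void
pool_put(struct obj *o)
{
	/* The work item is dead here; reuse its bytes as the link. */
	o->next_free = free_list;
	free_list = o;
}

static struct obj *
pool_get(void (*fn)(void *))
{
	struct obj *o = free_list;

	if (!o)
		return NULL;
	free_list = o->next_free;
	/* Only after unlinking may the link be overwritten by the work. */
	o->kill_work.fn = fn;
	return o;
}

static void noop(void *p) { (void)p; }

int
main(void)
{
	struct obj a = { .payload = 1 };

	pool_put(&a);
	assert(pool_get(noop) == &a && !free_list);
	return 0;
}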
diff --git a/fw/client.h b/fw/client.h
index d14409ba2..a280b98dc 100644
--- a/fw/client.h
+++ b/fw/client.h
@@ -24,11 +24,24 @@
 #include "http_limits.h"
 #include "connection.h"
 
+/*
+ * Client memory accounting structure for Tempesta FW.
+ *
+ * @kill_work	- Workqueue item used for asynchronous structure
+ *		  cleanup/destruction;
+ * @next_free	- Pointer to the next free object in the freelist;
+ * @refcnt	- Per-CPU reference counter. Provides scalable and
+ *		  thread-safe reference tracking on SMP systems with
+ *		  minimal contention;
+ * @mem		- Per-CPU memory accounting storage.
+ */
 typedef struct tfw_client_mem_t {
+	union {
+		struct work_struct		kill_work;
+		struct tfw_client_mem_t		*next_free;
+	};
 	struct percpu_ref	refcnt;
-	struct work_struct	kill_work;
 	long __percpu		*mem;
-	struct list_head	in_free_list;
 } TfwClientMem;
 
 /**
@@ -82,13 +95,13 @@ tfw_client_mem_put(TfwClientMem *cli_mem)
 }
 
 static inline long
-tfw_client_mem(TfwClient *cli)
+tfw_client_mem(TfwClientMem *cli_mem)
 {
 	long mem = 0;
 	int cpu;
 
 	for_each_online_cpu(cpu)
-		mem += *(per_cpu_ptr(cli->cli_mem->mem, cpu));
+		mem += *(per_cpu_ptr(cli_mem->mem, cpu));
 
 	return mem;
 }
diff --git a/fw/connection.h b/fw/connection.h
index ee0d6d378..0cc44c654 100644
--- a/fw/connection.h
+++ b/fw/connection.h
@@ -581,6 +581,7 @@ tfw_connection_validate_cleanup(TfwConn *conn)
 	BUG_ON(!conn);
 	BUG_ON(!list_empty(&conn->list));
 	BUG_ON(conn->stream.msg);
+	/* Was purged earlier, during connection release. */
 	BUG_ON(conn->write_queue);
 
 	rc = atomic_read(&conn->refcnt);
@@ -615,8 +616,8 @@ tfw_peer_for_each_conn(TfwPeer *p, int (*cb)(TfwConn *))
 }
 
 extern unsigned int tfw_cli_max_concurrent_streams;
-extern u64 tfw_cli_soft_mem_limit;
-extern u64 tfw_cli_hard_mem_limit;
+extern unsigned long tfw_cli_soft_mem_limit;
+extern unsigned long tfw_cli_hard_mem_limit;
 
 void tfw_connection_unlink_to_sk(TfwConn *conn);
 void tfw_connection_hooks_register(TfwConnHooks *hooks, int type);
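
The pool recycling above hinges on the percpu_ref state machine: percpu_ref_kill() forbids new trygets and drops the initial reference, the release callback fires when the count reaches zero, and percpu_ref_reinit() brings a released ref back for reuse. A sketch of that lifecycle using the stock kernel API (assumed kernel-module context, error handling trimmed):

#include <linux/percpu-refcount.h>

static void
example_release(struct percpu_ref *ref)
{
	/* Runs once the count hits zero after percpu_ref_kill(). */
}

static int
example_lifecycle(struct percpu_ref *ref)
{
	int r = percpu_ref_init(ref, example_release, 0, GFP_KERNEL);

	if (r)
		return r;
	percpu_ref_get(ref);	/* cheap per-CPU increment; count = 2 */
	percpu_ref_kill(ref);	/* no new trygets; drops initial ref */
	percpu_ref_put(ref);	/* count = 0 -> example_release() fires */
	/*
	 * A released ref may be recycled instead of percpu_ref_exit();
	 * this is what tfw_cli_mem_pool_free() does for pooled objects.
	 */
	percpu_ref_reinit(ref);
	return 0;
}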
diff --git a/fw/http.c b/fw/http.c
index d7960ddac..10d267e20 100644
--- a/fw/http.c
+++ b/fw/http.c
@@ -2907,6 +2907,15 @@ tfw_http_conn_msg_alloc(TfwConn *conn, TfwStream *stream)
 		int delta = PAGE_SIZE << hm->pool->order;
 
 		hm->pool->owner = cli_mem;
+		/*
+		 * `tfw_client_mem_get` returns false if we have already
+		 * called `percpu_ref_kill` for `cli_mem` after the client
+		 * was freed. We hold the client reference counter here
+		 * due to active connections, so this situation is
+		 * impossible; moreover, a false result here means memory
+		 * corruption, since we would be accessing the `cli_mem`
+		 * of an already released client.
+		 */
 		BUG_ON(!tfw_client_mem_get(cli_mem));
 		tfw_client_adjust_mem(cli_mem, delta);
 	}
@@ -5474,7 +5483,9 @@ tfw_h2_on_send_resp(void *conn, struct sk_buff **skb_head)
 	if (unlikely(!stream))
 		return -EPIPE;
 
-	BUG_ON(stream->xmit.skb_head || stream->xmit.resp);
+	if (WARN_ON(stream->xmit.skb_head || stream->xmit.resp))
+		return -EINVAL;
+
 	TFW_SKB_CB(*skb_head)->opaque_data =
 		CLIENT_MEM_FROM_CONN(resp->req->conn);
 	TFW_SKB_CB(*skb_head)->destructor = ss_skb_dflt_destructor;
@@ -7458,10 +7469,8 @@ tfw_http_resp_process(TfwConn *conn, TfwStream *stream, struct sk_buff *skb,
 				      skb->truesize);
 
 		r = frang_client_mem_limit(cli_conn, false);
-		if (unlikely(r)) {
-			BUG_ON(r != T_BLOCK);
+		if (unlikely(r))
 			return tfw_http_resp_filtout(hmresp);
-		}
 	} else {
 		conn_stop = false;
 	}
diff --git a/fw/http2.c b/fw/http2.c
index 21c1ab1f6..aedd48780 100644
--- a/fw/http2.c
+++ b/fw/http2.c
@@ -713,7 +713,7 @@ tfw_h2_stream_xmit_prepare_resp(TfwStream *stream)
 ALLOW_ERROR_INJECTION(tfw_h2_stream_xmit_prepare_resp, ERRNO);
 
 int
-tfw_h2_entail_stream_skb(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream,
+tfw_h2_entail_stream_skb(struct sock *sk, TfwStream *stream,
 			 unsigned int *len, bool should_split)
 {
 	unsigned char tls_type = skb_tfw_tls_type(stream->xmit.skb_head);
diff --git a/fw/http2.h b/fw/http2.h
index e95e97d9d..0c9ac21ec 100644
--- a/fw/http2.h
+++ b/fw/http2.h
@@ -192,7 +192,7 @@ TfwStream *tfw_h2_find_not_closed_stream(TfwH2Ctx *ctx, unsigned int id,
 void tfw_h2_req_unlink_stream(TfwHttpReq *req);
 void tfw_h2_req_unlink_and_close_stream(TfwHttpReq *req);
 int tfw_h2_stream_xmit_prepare_resp(TfwStream *stream);
-int tfw_h2_entail_stream_skb(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream,
+int tfw_h2_entail_stream_skb(struct sock *sk, TfwStream *stream,
 			     unsigned int *len, bool should_split);
 TfwStreamSchedEntry *tfw_h2_alloc_stream_sched_entry(TfwH2Ctx *ctx);
 void tfw_h2_free_stream_sched_entry(TfwH2Ctx *ctx, TfwStreamSchedEntry *entry);
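
One behavioral change worth noting: tfw_h2_on_send_resp() now degrades the inconsistent-stream assertion from a crash to a recoverable failure. WARN_ON() evaluates to its condition, which is what enables this idiom; a minimal illustration with a hypothetical state struct:

#include <linux/bug.h>
#include <linux/errno.h>
#include <linux/skbuff.h>

struct stream_state_sketch {
	struct sk_buff	*skb_head;
	void		*resp;
};

static int
on_send_resp_sketch(struct stream_state_sketch *s)
{
	/*
	 * An already-populated xmit state here is a bug, but one the
	 * server can survive: log a backtrace once and fail the send
	 * instead of taking the whole kernel down via BUG_ON().
	 */
	if (WARN_ON(s->skb_head || s->resp))
		return -EINVAL;
	return 0;
}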
diff --git a/fw/http_frame.c b/fw/http_frame.c
index e7d1bf5a8..1988bc583 100644
--- a/fw/http_frame.c
+++ b/fw/http_frame.c
@@ -2118,7 +2118,7 @@ tfw_h2_insert_frame_header(struct sock *sk, TfwH2Ctx *ctx, TfwStream *stream,
 	/* Send previosly successfully prepared frames if exist. */
 	stream->xmit.frame_length -= frame_length + FRAME_HEADER_SIZE;
 	if (stream->xmit.frame_length) {
-		r = tfw_h2_entail_stream_skb(sk, ctx, stream,
+		r = tfw_h2_entail_stream_skb(sk, stream,
 					     &stream->xmit.frame_length,
 					     true);
 	}
@@ -2141,15 +2141,35 @@ tfw_h2_stream_send_postponed(struct sock *sk, struct sk_buff **skb_head,
 			     unsigned int mss_now, unsigned long *snd_wnd)
 {
 	TfwConn *conn = (TfwConn *)sk->sk_user_data;
-	int r;
 
-	BUG_ON(conn->write_queue);
-	r = ss_skb_tcp_entail_list(sk, skb_head, mss_now, snd_wnd);
-	if (unlikely(r))
-		return r;
+	/*
+	 * We send all data from `conn->write_queue` before calling
+	 * `tfw_h2_make_frames`. So there is only one case when
+	 * `conn->write_queue` can be non-empty here: we send
+	 * postponed frames from the `HTTP2_SEND_FRAMES` state and
+	 * then call this function again from HTTP2_MAKE_FRAMES_FINISH.
+	 * If `conn->write_queue` is not empty, we should not entail new
+	 * data to the socket write queue (it should be sent later, after
+	 * the data from the connection has been sent); just add the skb
+	 * to the end of the connection write_queue.
+	 */
+	if (likely(!conn->write_queue)) {
+		int r;
+
+		r = ss_skb_tcp_entail_list(sk, skb_head, mss_now, snd_wnd);
+		if (unlikely(r))
+			return r;
+	} else {
+		/*
+		 * The send window was exceeded during a previous call of
+		 * `tfw_h2_stream_send_postponed`.
+		 */
+		WARN_ON(*snd_wnd);
+	}
 
 	ss_skb_queue_splice(&conn->write_queue, skb_head);
-	sock_set_flag(sk, SOCK_TEMPESTA_HAS_DATA);
+	if (unlikely(conn->write_queue))
+		sock_set_flag(sk, SOCK_TEMPESTA_HAS_DATA);
 
 	return 0;
 }
@@ -2311,7 +2331,7 @@ do { \
 
 	T_FSM_STATE(HTTP2_SEND_FRAMES) {
 		if (likely(stream->xmit.frame_length)) {
-			r = tfw_h2_entail_stream_skb(sk, ctx, stream,
+			r = tfw_h2_entail_stream_skb(sk, stream,
 						     &stream->xmit.frame_length,
 						     false);
 			if (unlikely(r)) {
@@ -2347,6 +2367,13 @@ do { \
 				T_FSM_JMP(HTTP2_MAKE_TRAILER_CONTINUATION_FRAMES);
 			}
 		} else {
+			/*
+			 * If there are no headers, data or trailer
+			 * frames to send, all data should have been
+			 * entailed to the socket write queue at the
+			 * beginning of this state.
+			 */
+			WARN_ON(stream->xmit.frame_length);
 			fallthrough;
 		}
 	}
@@ -2361,6 +2388,7 @@ do { \
 	 */
 	if (unlikely(stream->xmit.skb_head)) {
 		struct sk_buff **head = &stream->xmit.skb_head;
+
 		r = tfw_h2_stream_send_postponed(sk, head, mss_now,
 						 snd_wnd);
@@ -2386,13 +2414,14 @@ do { \
 	T_FSM_FINISH(r, stream->xmit.state);
 
 	if (stream->xmit.frame_length) {
-		r = tfw_h2_entail_stream_skb(sk, ctx, stream,
+		r = tfw_h2_entail_stream_skb(sk, stream,
 					     &stream->xmit.frame_length,
 					     true);
 		if (unlikely(r)) {
 			T_WARN("Failed to send frame %d", r);
 			return r;
 		}
+		WARN_ON(stream->xmit.frame_length);
 
 		if (unlikely(stream->xmit.postponed)
 		    && !ctx->cur_send_headers) {
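
The invariant the rewritten tfw_h2_stream_send_postponed() maintains, boiled down: data may go straight to the socket only while the connection-level queue is empty; once anything is queued there, all later data must line up behind it to keep frame order. A reduced sketch with illustrative types and callbacks, not the Tempesta FW signatures:

#include <linux/skbuff.h>

struct conn_sketch { struct sk_buff *write_queue; };

static int
send_postponed_sketch(struct conn_sketch *conn, struct sk_buff **skb_head,
		      int (*entail)(struct sk_buff **skb_head),
		      void (*splice)(struct sk_buff **head,
				     struct sk_buff **skb))
{
	if (!conn->write_queue) {
		/* Fast path: nothing queued, push to the socket now. */
		int r = entail(skb_head);

		if (r)
			return r;
	}
	/* Leftovers (or everything, on the slow path) keep FIFO order. */
	splice(&conn->write_queue, skb_head);
	return 0;
}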
diff --git a/fw/http_limits.c b/fw/http_limits.c
index 566de32fd..f7018ecfe 100644
--- a/fw/http_limits.c
+++ b/fw/http_limits.c
@@ -1678,7 +1678,7 @@ frang_client_mem_limit(TfwCliConn *conn, bool block_if_exceeded)
 	TfwVhost *dflt_vh;
 
 	if (likely(!tfw_cli_hard_mem_limit
-		   || tfw_client_mem(cli) <= tfw_cli_hard_mem_limit))
+		   || tfw_client_mem(cli->cli_mem) <= tfw_cli_hard_mem_limit))
 		return 0;
 
 	if (!block_if_exceeded)
diff --git a/fw/pool.c b/fw/pool.c
index ec920fab2..5a52b7d69 100644
--- a/fw/pool.c
+++ b/fw/pool.c
@@ -270,8 +270,18 @@ __tfw_pool_new(size_t n, TfwClientMem *owner)
 	if (unlikely(!c))
 		return NULL;
 
-	if (cli_mem)
+	if (cli_mem) {
+		/*
+		 * `tfw_client_mem_get` returns false if we have already
+		 * called `percpu_ref_kill` for `cli_mem` after the client
+		 * was freed. We hold the client reference counter here
+		 * due to active connections, so this situation is
+		 * impossible; moreover, a false result here means memory
+		 * corruption, since we would be accessing the `cli_mem`
+		 * of an already released client.
+		 */
 		BUG_ON(!tfw_client_mem_get(cli_mem));
+	}
 
 	p = (TfwPool *)((char *)c + TFW_POOL_ALIGN_SZ(sizeof(*c)));
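
The frang check above implements a two-threshold policy; reduced to its decision table (illustrative return values, not the Tempesta FW codes; the caller chooses whether exceeding the hard limit blocks the client or only filters the message):

#include <stdbool.h>

enum { LIMIT_OK, LIMIT_FILTER, LIMIT_BLOCK };

static int
mem_limit_sketch(long used, long hard_limit, bool block_if_exceeded)
{
	/* A zero hard limit disables the check entirely. */
	if (!hard_limit || used <= hard_limit)
		return LIMIT_OK;
	return block_if_exceeded ? LIMIT_BLOCK : LIMIT_FILTER;
}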
diff --git a/fw/sock_clnt.c b/fw/sock_clnt.c
index 3beb82ddb..cbe41e35d 100644
--- a/fw/sock_clnt.c
+++ b/fw/sock_clnt.c
@@ -48,8 +48,8 @@ static struct kmem_cache *tfw_h2_conn_cache;
 static int tfw_cli_cfg_ka_timeout = -1;
 unsigned int tfw_cli_max_concurrent_streams;
 
-u64 tfw_cli_soft_mem_limit;
-u64 tfw_cli_hard_mem_limit;
+unsigned long tfw_cli_soft_mem_limit;
+unsigned long tfw_cli_hard_mem_limit;
 
 static inline struct kmem_cache *
 tfw_cli_cache(int type)
@@ -697,19 +697,6 @@ tfw_cfgop_keepalive_timeout(TfwCfgSpec *cs, TfwCfgEntry *ce)
 	return 0;
 }
 
-static int
-tfw_parse_client_mem(const char *val, unsigned long long *mem)
-{
-	size_t len = strlen(val);
-	char *p;
-
-	*mem = memparse(val, &p);
-	if (p != val + len)
-		return -EINVAL;
-
-	return 0;
-}
-
 static int
 tfw_cfgop_client_mem(TfwCfgSpec *cs, TfwCfgEntry *ce)
 {
@@ -719,7 +706,7 @@ tfw_cfgop_client_mem(TfwCfgSpec *cs, TfwCfgEntry *ce)
 	TFW_CFG_CHECK_VAL_N(>=, 1, cs, ce);
 	TFW_CFG_CHECK_VAL_N(<, 3, cs, ce);
 
-	r = tfw_parse_client_mem(ce->vals[0], &tfw_cli_soft_mem_limit);
+	r = tfw_cfg_set_mem_val(cs, ce, 0, &tfw_cli_soft_mem_limit);
 	if (unlikely(r)) {
 		T_ERR_NL("Invalid 'client_mem' value: '%s'",
 			 ce->vals[0]);
@@ -727,20 +714,19 @@ tfw_cfgop_client_mem(TfwCfgSpec *cs, TfwCfgEntry *ce)
 	}
 
 	if (ce->val_n > 1) {
-		r = tfw_parse_client_mem(ce->vals[1], &tfw_cli_hard_mem_limit);
+		r = tfw_cfg_set_mem_val(cs, ce, 1, &tfw_cli_hard_mem_limit);
 		if (unlikely(r)) {
 			T_ERR_NL("Invalid 'client_mem' value: '%s'",
 				 ce->vals[1]);
 			return r;
 		}
 	} else {
-		tfw_cli_hard_mem_limit = (tfw_cli_soft_mem_limit < U64_MAX / 2) ?
-			tfw_cli_soft_mem_limit * 2 : U64_MAX;
+		tfw_cli_hard_mem_limit = 2 * tfw_cli_soft_mem_limit;
 	}
 
 	if (tfw_cli_hard_mem_limit < tfw_cli_soft_mem_limit) {
-		T_ERR_NL("Invalid 'client_mem' value: hard limit (%llu) is"
-			 " less then soft (%llu)", tfw_cli_hard_mem_limit,
+		T_ERR_NL("Invalid 'client_mem' value: hard limit (%lu) is"
+			 " less than soft (%lu)", tfw_cli_hard_mem_limit,
 			 tfw_cli_soft_mem_limit);
 		return -EINVAL;
 	}
@@ -992,12 +978,16 @@ static TfwCfgSpec tfw_sock_clnt_specs[] = {
 	},
 	{
 		.name = "client_mem",
-		.deflt = "0 0",
+		.deflt = "500M 1G",
 		.handler = tfw_cfgop_client_mem,
 		.cleanup = tfw_cfgop_cleanup_sock_clnt,
 		.allow_none = true,
 		.allow_repeat = false,
 		.allow_reconfig = true,
+		.spec_ext = &(TfwCfgSpecMem) {
+			.multiple_of = "1K",
+			.range = { "1K", "100G" },
+		}
 	},
 	{ 0 }
 };
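
With the new default and the spec_ext constraints, a client_mem line now parses through the generic memory validator. An example configuration, assuming the usual Tempesta FW directive syntax and hypothetical values (both limits must be multiples of 1K and within [1K, 100G]):

# Soft limit 256M, hard limit 512M.
client_mem 256M 512M;

# Single value: the hard limit defaults to twice the soft limit.
client_mem 256M;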
@@ -1745,8 +1751,15 @@ ss_skb_dflt_destructor(struct sk_buff *skb)
 	TfwClientMem *cli_mem =
 		(TfwClientMem *)TFW_SKB_CB(skb)->opaque_data;
 
+	/*
+	 * If the skb is in the socket write queue, skb->cb is used to
+	 * store `struct tcp_sock` data. Inside `ss_skb_adjust_client_mem`
+	 * we also access skb->cb, so we would corrupt the `tcp_sock` data
+	 * of skbs in the socket write queue.
+	 */
 	BUG_ON(skb_tfw_is_in_socket_write_queue(skb));
 	ss_skb_adjust_client_mem(skb, -TFW_SKB_CB(skb)->mem);
+	WARN_ON(TFW_SKB_CB(skb)->mem);
 	tfw_client_mem_put(cli_mem);
 }
diff --git a/fw/ss_skb.h b/fw/ss_skb.h
index b0febb0c0..a98e80a68 100644
--- a/fw/ss_skb.h
+++ b/fw/ss_skb.h
@@ -168,6 +168,11 @@ ss_skb_queue_splice(struct sk_buff **skb_head, struct sk_buff **skb)
 	*skb = NULL;
 }
 
+/*
+ * Orphan the skb from Tempesta-specific ownership.
+ * We use our own version (instead of `skb_orphan`) to avoid using
+ * the `skb->sk` field inside the Tempesta FW source code.
+ */
 static inline void
 ss_skb_orphan(struct sk_buff *skb)
 {
@@ -178,11 +183,20 @@ ss_skb_orphan(struct sk_buff *skb)
 
 	destructor = TFW_SKB_CB(skb)->destructor;
 	if (destructor) {
+		/*
+		 * The same BUG_ON is present in the Linux kernel in
+		 * `skb_orphan`. The opaque data will be used inside the
+		 * destructor, so if it is NULL, we still catch the BUG
+		 * later.
+		 */
 		BUG_ON(!TFW_SKB_CB(skb)->opaque_data);
 		destructor(skb);
 		TFW_SKB_CB(skb)->destructor = NULL;
 		TFW_SKB_CB(skb)->opaque_data = NULL;
 	} else {
+		/*
+		 * The same BUG_ON is present in the Linux kernel in
+		 * `skb_orphan`.
+		 */
 		BUG_ON(TFW_SKB_CB(skb)->opaque_data);
 	}
 }
diff --git a/fw/t/unit/test.c b/fw/t/unit/test.c
index a2e444ffb..64772cab3 100644
--- a/fw/t/unit/test.c
+++ b/fw/t/unit/test.c
@@ -91,6 +91,12 @@ extern TEST_SUITE_MPART_ARR(http2_parser, H2_SUITE_PART_CNT);
 extern int tfw_pool_init(void);
 extern void tfw_pool_exit(void);
 
+static inline TfwClientMem *
+__tfw_client_mem_from_conn(TfwConn *conn)
+{
+	return ((TfwClient *)conn->peer)->cli_mem;
+}
+
 int
 test_run_all(void)
 {
@@ -120,19 +126,19 @@ test_run_all(void)
 	TEST_SUITE_MPART_RUN(http1_parser);
 	test_req_resp_cleanup();
-	EXPECT_EQ(tfw_client_mem((TfwClient *)conn_req.peer), 0);
+	EXPECT_EQ(tfw_client_mem(__tfw_client_mem_from_conn(&conn_req)), 0);
 	__fpu_schedule();
 
 	TEST_SETUP(test_http2_parser_setup_fn);
 	TEST_TEARDOWN(test_http2_parser_teardown_fn);
 	TEST_SUITE_MPART_RUN(http2_parser);
-	EXPECT_EQ(tfw_client_mem((TfwClient *)conn_req.peer), 0);
+	EXPECT_EQ(tfw_client_mem(__tfw_client_mem_from_conn(&conn_req)), 0);
 	__fpu_schedule();
 
 	TEST_SUITE_RUN(http2_parser_hpack);
-	EXPECT_EQ(tfw_client_mem((TfwClient *)conn_req.peer), 0);
+	EXPECT_EQ(tfw_client_mem(__tfw_client_mem_from_conn(&conn_req)), 0);
 	__fpu_schedule();
 
 	TEST_SUITE_RUN(http_cache);
@@ -143,7 +149,7 @@ test_run_all(void)
 	TEST_SUITE_RUN(http_msg);
 	test_req_resp_cleanup();
-	EXPECT_EQ(tfw_client_mem((TfwClient *)conn_req.peer), 0);
+	EXPECT_EQ(tfw_client_mem(__tfw_client_mem_from_conn(&conn_req)), 0);
 	__fpu_schedule();
 
 	TEST_SUITE_RUN(hash);
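
The ss_skb_orphan machinery above is a private control-block variant of the kernel's skb_orphan(); the pattern in isolation, with illustrative names rather than the real TFW_SKB_CB layout:

#include <stddef.h>

struct cb_sketch {
	void	(*destructor)(void *opaque);
	void	*opaque_data;
};

static void
orphan_sketch(struct cb_sketch *cb)
{
	if (cb->destructor) {
		/* Run the owner's cleanup, then drop ownership. */
		cb->destructor(cb->opaque_data);
		cb->destructor = NULL;
		cb->opaque_data = NULL;
	}
	/* Ownerless buffers must not carry stale opaque data either. */
}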
diff --git a/lib/log.h b/lib/log.h
index e11e0e623..49a1f9d61 100644
--- a/lib/log.h
+++ b/lib/log.h
@@ -108,6 +108,13 @@ is_tfw_internal_error_code(int err_code)
 	       && err_code < __T_INTERNAL_ERROR_CODE_END;
 }
 
+/*
+ * Compare two error codes by severity.
+ * A lower value is treated as more critical, because all error codes
+ * are listed in the TfwRcCommon enum from the most crucial to the least
+ * crucial. If more than one error occurs during request/response
+ * processing, the more crucial error should be returned.
+ */
 static inline bool
 tfw_error_code_more_crucial(int err_code1, int err_code2)
 {
@@ -119,6 +126,10 @@ tfw_error_code_more_crucial(int err_code1, int err_code2)
 	return err_code1 < err_code2;
 }
 
+/*
+ * Determine if an error is considered "crucial" (i.e., the connection
+ * should be closed).
+ */
 static inline bool
 tfw_error_code_is_crucial(int err_code)
 {
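
Usage sketch for the comparison helper documented above: when several errors accumulate while processing a message, keep the numerically lower (more crucial) code. merge_error_codes is a hypothetical helper, assuming lib/log.h is included:

static inline int
merge_error_codes(int cur, int new_rc)
{
	return tfw_error_code_more_crucial(new_rc, cur) ? new_rc : cur;
}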