slub分配器的理解

slub分配器的原理，这里已经分析的很清楚。

这里主要记录我对这个的理解，怕以后忘记。

关键数据结构

struct kmem_cache

这个是slub分配器的总管，如果要针对特定大小，特定用途的内存有一个分配器的话，那就必须要创建这个结构。而每次分配内存的时候，需要把这个结构传递给分配器。

struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
        struct kmem_cache_cpu __percpu *cpu_slab;
#endif
        //..
        struct kmem_cache_node *node[MAX_NUMNODES];
};

cpu_slab 是每个cpu都有自己的copy，这样每个cpu首先要在自己的cache里面进行分配，在这个cache pool里面分配是不需要锁的，毕竟这个cache pool只给这个CPU分配。这样会大大提高效率。

node 这个变量跟NUMA有关系，在分配内存的时候，肯定是希望尽可能分配的内存使这个CPU访问的效率最高。所以当cpu_slab没法满足分配的时，尽可能的到它对应的node里面去找slab来进行分配。也许在这个node里面别的CPU已经释放了一些slab，这样这个CPU就可以直接利用了，也不需要重新到伙伴系统里面去分配。

这也就是这两个结构里面都会有一些slab的原因。

struct kmem_cache_cpu

struct kmem_cache_cpu {
        union {
                struct {
                        void **freelist;        /* Pointer to next available object */
                        unsigned long tid;      /* Globally unique transaction id */
                };
                freelist_aba_t freelist_tid;
        };
        struct slab *slab;      /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
        struct slab *partial;   /* Partially allocated slabs */
#endif
        //...
};

这里出现了一个freelist，这个是指向第一个free object的对应的指针。这里把一个slab所对应的内存划分成很多个object，这些object就是最终要分配给上层软件的内存空间。在没有分配之前，这里会利用这个object所在的内存，存放next 指针，这个next 指针存放在这个object什么地方，由 struct kmem_cache的offset 来决定。这个offset一般放在这个object的中间，防止踩内存的出现，参考函数： calculate_sizes

/*
 * Store freelist pointer near middle of object to keep
 * it away from the edges of the object to avoid small
 * sized over/underflows from neighboring allocations.
 */
s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));

tid在这里的作用是同步任务用的，如果当前任务分配内存分配到一半，被切换出去了，而前面计划分配给这个任务的内存，可能被别的内务给分配了，那这个时候就需要重新给这个任务分配。参考函数： __slab_alloc_node：

static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
                gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
        //...
redo:
        //获取当前CPU的slab信息
        c = raw_cpu_ptr(s->cpu_slab);
        tid = READ_ONCE(c->tid);

        if (!USE_LOCKLESS_FAST_PATH() ||
            unlikely(!object || !slab || !node_match(slab, node))) {
            //Slow path里面会disable 任务迁移到别的CPU里面去
                object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
        } else {
                //如果发现这个tid和c->tid不一样，直接fail了重新做
                        if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
                                note_cmpxchg_failure("slab_alloc", s, tid);
                                goto redo;
                        }
            //..
    }
}

static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                          unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
        void *p;

#ifdef CONFIG_PREEMPT_COUNT
        /*
         * We may have been preempted and rescheduled on a different
         * cpu before disabling preemption. Need to reload cpu area
         * pointer.
         */
        c = slub_get_cpu_ptr(s->cpu_slab);  //disable 任务迁移到别的CPU
#endif

        p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
        slub_put_cpu_ptr(s->cpu_slab);
#endif
        return p;
}

字段slab 对应的就是当前被分配的slab，每次alloc 内存的时候，如果slab为空(第一次使用），或者freelist 为空（slab没有空闲的object)，那就需要先分配一个新的slab。

至于如何分配新的slab，这个首先要到partial列表里面去找对应的slab，如果还找不到，就到struct kmem_cache_node 的partial列表去找，如果还找不到，就到buddy里面去分配一个新的slab。

partial 就是一个slab的列表，因为系统内存会分配和释放同时进行，这样就会出现多个slab，部分有分配出去，部分是空闲的，这也就是这个字段partial的来源。

struct kmem_cache_node

这个结构主要是维护统一个numa node里面被分配过的slab列表。当这个CPU的slab全都被分配出去之后，就可以考虑从这个node里面去找对应的slab，也许别的CPU释放了很多内存，最终被释放到numa node里面，这样就不需要到buddy里面去分配。

struct kmem_cache_node {
        spinlock_t list_lock;
        unsigned long nr_partial;
        struct list_head partial;
        //....
};

struct kmem_cache {
        //..
        struct kmem_cache_node *node[MAX_NUMNODES];
};

kmem_cache 里面的node 是一个指针，所以也需要分配内存，这个也是一个专门的cache pool里面来分配的，具体看下面。

管理数据的创建

在执行完mm_core_init -> mem_init -> memblock_free_all 之后，buddy系统就已经准备好了，可以进行以page为单位的内存分配。

mm_core_init -> kmem_cache_init 就是创建struct kmem_cache 管理数据啦，在这之后就可以调用kmalloc来进行小块内存的分配和释放。

系统有：

struct kmem_cache *kmem_cache;
static struct kmem_cache *kmem_cache_node;

系统里面可能会有很多的struct kmem_cache 和struct kmem_cache_node ，所以也需要为他们准备一个pool。这两个pool就是为他们准备的，它们的初始化稍微有一点点不一样，可以参考函数 kmem_cache_init：

void __init kmem_cache_init(void)
{
        static __initdata struct kmem_cache boot_kmem_cache,
                boot_kmem_cache_node;
        int node;

        //不是很清楚为啥这里使用静态变量，感觉stack 变量应该也可以，后面在bootstrap的时候会copy到动态分配的变量里
        kmem_cache_node = &boot_kmem_cache_node;
        kmem_cache = &boot_kmem_cache;

        /*
         * Initialize the nodemask for which we will allocate per node
         * structures. Here we don't need taking slab_mutex yet.
         */
        for_each_node_state(node, N_NORMAL_MEMORY)
                node_set(node, slab_nodes);

        //第一步就是创建struct kmem_cache_node的pool， 这个时候对应的struct kmem_cache->node就不能使用这个cache来分配，
        //而是要使用early_kmem_cache_node_alloc来分配，参看函数：init_kmem_cache_nodes
        create_boot_cache(kmem_cache_node, "kmem_cache_node",
                sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);

        hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);

        /* Able to allocate the per node structures */
        slab_state = PARTIAL;

        //第二步就是创建struct kmem_cache的pool，这个时候可以使用上面那个pool来分配这个对应的struct kmem_cache_node，这也就是上面那句slab_state = PARTIAL的原因
        //这个大小就是根据系统有多少node就分配多少，不需要分配所有的node的指针
        create_boot_cache(kmem_cache, "kmem_cache",
                        offsetof(struct kmem_cache, node) +
                                nr_node_ids * sizeof(struct kmem_cache_node *),
                       SLAB_HWCACHE_ALIGN, 0, 0);

        //前面不是使用静态变量来存放struct kmem_cache，这里重新在kmem_cache这个pool里面分配这个结构体，
        //再把想对应的信息复制过去，然后 struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; 使命就完成
        kmem_cache = bootstrap(&boot_kmem_cache);
        kmem_cache_node = bootstrap(&boot_kmem_cache_node);

        //第三步，有了前面两步，那现在就可以直接使用上面两个pool来创建比的kmem_cache
        //下面这里就是创建一下基本大小的pool，如8，16，32等等
        /* Now we can use the kmem_cache to allocate kmalloc slabs */
        setup_kmalloc_cache_index_table();
        create_kmalloc_caches();

        //到这里之后，就可以开始使用kmalloc来分配内存了

所有这些数据结构的创建过程，最终都是调用 __kmem_cache_create ，这个函数只是初始化一些管理的信息，而不会为这个创建的cache pool分配任何的slab。分配slab这个动作都是在分配内存的时候才会去做。

但是因为这里需要用到struct kmem_cache 和struct kmem_cache_node，所有可能会在kmem_cache 和kmem_cache_node 这两个pool里面分配内存，这两个pool也就有可能会分配slab。这个上面那个原则不矛盾。也就是使用pool来分配内存的时候才会分配对应的slab。

在create_kmalloc_cache 中会分配结构struct kmem_cache，在init_kmem_cache_nodes 中分分配结构struct kmem_cache_node。所以这两个函数都会为这两个pool分配slab

分配

最终的分配都要到函数slab_alloc_node -> __slab_alloc_node:

static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
                gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
        struct kmem_cache_cpu *c;
        struct slab *slab;
        unsigned long tid;
        void *object;

redo:
        /*
         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
         * enabled. We may switch back and forth between cpus while
         * reading from one cpu area. That does not matter as long
         * as we end up on the original cpu again when doing the cmpxchg.
         *
         * We must guarantee that tid and kmem_cache_cpu are retrieved on the
         * same cpu. We read first the kmem_cache_cpu pointer and use it to read
         * the tid. If we are preempted and switched to another cpu between the
         * two reads, it's OK as the two are still associated with the same cpu
         * and cmpxchg later will validate the cpu.
         */
        c = raw_cpu_ptr(s->cpu_slab);
        tid = READ_ONCE(c->tid);

        /*
         * Irqless object alloc/free algorithm used here depends on sequence
         * of fetching cpu_slab's data. tid should be fetched before anything
         * on c to guarantee that object and slab associated with previous tid
         * won't be used with current tid. If we fetch tid first, object and
         * slab could be one associated with next tid and our alloc/free
         * request will be failed. In this case, we will retry. So, no problem.
         */
        barrier();

        /*
         * The transaction ids are globally unique per cpu and per operation on
         * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
         * occurs on the right processor and that there was no operation on the
         * linked list in between.
         */

        object = c->freelist;
        slab = c->slab;

        if (!USE_LOCKLESS_FAST_PATH() ||
            unlikely(!object || !slab || !node_match(slab, node))) {
                object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
        } else {
                void *next_object = get_freepointer_safe(s, object);

                /*
                 * The cmpxchg will only match if there was no additional
                 * operation and if we are on the right processor.
                 *
                 * The cmpxchg does the following atomically (without lock
                 * semantics!)
                 * 1. Relocate first pointer to the current per cpu area.
                 * 2. Verify that tid and freelist have not been changed
                 * 3. If they were not changed replace tid and freelist
                 *
                 * Since this is without lock semantics the protection is only
                 * against code executing on this cpu *not* from access by
                 * other cpus.
                 */
                if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
                        note_cmpxchg_failure("slab_alloc", s, tid);
                        goto redo;
                }
                prefetch_freepointer(s, next_object);
                stat(s, ALLOC_FASTPATH);
        }

        return object;
}

那个redo的原因，前面也解释过了，注释也是比较清楚的。在slow path的时候，函数__slab_alloc 中会禁止把任务调度到别的CPU上去。

static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                          unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
        void *p;

#ifdef CONFIG_PREEMPT_COUNT
        /*
         * We may have been preempted and rescheduled on a different
         * cpu before disabling preemption. Need to reload cpu area
         * pointer.
         */
        c = slub_get_cpu_ptr(s->cpu_slab); //禁止把任务调度到别的CPU上去
#endif

        p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
        slub_put_cpu_ptr(s->cpu_slab);
#endif
        return p;
}

在slow path的___slab_alloc里面，逻辑稍微复杂一点，其实就是什么时候在CPU slab里面直接分配，什么时候去CPU partial里面分配，什么时候去node里面分配，什么时候去buddy里面去分配。

释放

释放最终都会调用到函数： slab_free -> do_slab_free ,如果free 的object 是在当前CPU的slab上，则直接设置kmem_cache_cpu 的freelist,如果不在当前CPU的slab上，则调用__slab_free。

第一步，更新当前slab的信息，就是把要free的object 放到slab的freelist 里来。

do {
        if (unlikely(n)) {
                spin_unlock_irqrestore(&n->list_lock, flags);
                n = NULL;
        }
        prior = slab->freelist;
        counters = slab->counters;
        set_freepointer(s, tail, prior);
        new.counters = counters;
        was_frozen = new.frozen;
        new.inuse -= cnt;
        if ((!new.inuse || !prior) && !was_frozen) {
                /* Needs to be taken off a list */
                if (!kmem_cache_has_cpu_partial(s) || prior) {

                        n = get_node(s, slab_nid(slab));
                        /*
                         * Speculatively acquire the list_lock.
                         * If the cmpxchg does not succeed then we may
                         * drop the list_lock without any processing.
                         *
                         * Otherwise the list_lock will synchronize with
                         * other processors updating the list of slabs.
                         */
                        spin_lock_irqsave(&n->list_lock, flags);

                        on_node_partial = slab_test_node_partial(slab);
                }
        }

} while (!slab_update_freelist(s, slab,
        prior, counters,
        head, new.counters,
        "__slab_free"));

第二步，如果free之前，这个slab是完整被分配出去的，就把这个slab 挂到CPU 的partial list 里来，就结束。

if (likely(!n)) {

        if (likely(was_frozen)) {
                /*
                 * The list lock was not taken therefore no list
                 * activity can be necessary.
                 */
                stat(s, FREE_FROZEN);
        } else if (kmem_cache_has_cpu_partial(s) && !prior) {
                /*
                 * If we started with a full slab then put it onto the
                 * per cpu partial list.
                 */
                put_cpu_partial(s, slab, 1);
                stat(s, CPU_PARTIAL_FREE);
        }

        return;
}

第三步，如果free 之后，这个slab变成了所有objects都是free状态的情况，这个时候就考虑这个node 里面的partial slab 的数量是不是大于最小值，大于就调用discard_slab回收这个slab。

        if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
                goto slab_empty;

        //...

slab_empty:
        if (prior) {
                /*
                 * Slab on the partial list.
                 */
                remove_partial(n, slab);
                stat(s, FREE_REMOVE_PARTIAL);
        }

        spin_unlock_irqrestore(&n->list_lock, flags);
        stat(s, FREE_SLAB);
        discard_slab(s, slab);

第四步，如果free之前，这个slab是完整被分配出去的，在free 一个object 之后，就可以把他加回node 的partial list：

/*
 * Objects left in the slab. If it was not on the partial list before
 * then add it.
 */
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
        add_partial(n, slab, DEACTIVATE_TO_TAIL);
        stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
return;

comments

kmem_cache 作为一个基本的 allocator 组件，这里可以把他理解为用于某个特定大小的内存对象池。

Linux 系统中本身也存在很多这样的内存 …

comments

Linux学习笔记

Other articles

kmem_cache内存池使用