Iterating a vector in C++

Such a simple topic - iterating over a vector - is it even worth discussing?

Interestingly enough, there is a difference in how exactly you iterate - be it using iterators, the for(:) sugar, or the plain old for(i = 0; i < vec.size(); ++i) loop.

Let us see what output a compiler produces in each of these cases.
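
All the samples below call an opaque function moo on each element. Assume it is declared in a separate translation unit - e.g. as void moo(int); (the exact signature is an assumption) - so the compiler cannot inline it and elide the loops entirely:

// assumed declaration, defined elsewhere, so the optimizer
// cannot see through the call and remove the loops
void moo(int);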

sample1 (simple for loop):

std::vector<int> data{ 5, 3, 2, 1, 4 };

for (auto i = 0; i < data.size(); ++i) { // note: auto deduces int here, hence the movsx sign-extensions below
  moo(data[i]);
}
        mov     DWORD PTR [rbp-20], 0
        jmp     .L3
.L4:
        mov     eax, DWORD PTR [rbp-20]
        movsx   rdx, eax
        lea     rax, [rbp-64]
        mov     rsi, rdx
        mov     rdi, rax
        call    std::vector<int, std::allocator<int> >::operator[](unsigned long)
        mov     eax, DWORD PTR [rax]
        mov     edi, eax
        call    moo(int)
        add     DWORD PTR [rbp-20], 1
.L3:
        mov     eax, DWORD PTR [rbp-20]
        movsx   rbx, eax
        lea     rax, [rbp-64]
        mov     rdi, rax
        call    std::vector<int, std::allocator<int> >::size() const
        cmp     rbx, rax
        setb    al
        test    al, al
        jne     .L4

but

sample2 (reversed for loop):

for (auto i = data.size() - 1; i >= 0; --i) { // beware: i is an unsigned size_t, so i >= 0 is always true
  moo(data[i]);
}
        lea     rax, [rbp-64]
        mov     rdi, rax
        call    std::vector<int, std::allocator<int> >::size() const
        sub     rax, 1
        mov     QWORD PTR [rbp-24], rax
.L3:
        mov     rdx, QWORD PTR [rbp-24]
        lea     rax, [rbp-64]
        mov     rsi, rdx
        mov     rdi, rax
        call    std::vector<int, std::allocator<int> >::operator[](unsigned long)
        mov     eax, DWORD PTR [rax]
        mov     edi, eax
        call    moo(int)
        sub     QWORD PTR [rbp-24], 1
        jmp     .L3
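
Note the complete absence of a loop condition check - a sub followed by an unconditional jmp. The reason: data.size() returns an unsigned size_t, so i >= 0 is always true, the compiler removes the comparison altogether, and the loop as written never terminates. A well-defined way to iterate backwards over an unsigned range, as a sketch:

// i-- > 0 tests first and decrements after, so the body sees
// indices size()-1 down to 0 with no unsigned wraparound
for (auto i = data.size(); i-- > 0;) {
  moo(data[i]);
}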

also

sample3 (foreach):

for (auto i : data) {
  moo(i);
}
        lea     rax, [rbp-80]
        mov     QWORD PTR [rbp-24], rax
        mov     rax, QWORD PTR [rbp-24]
        mov     rdi, rax
        call    std::vector<int, std::allocator<int> >::begin()
        mov     QWORD PTR [rbp-88], rax
        mov     rax, QWORD PTR [rbp-24]
        mov     rdi, rax
        call    std::vector<int, std::allocator<int> >::end()
        mov     QWORD PTR [rbp-96], rax
        jmp     .L3
.L4:
        lea     rax, [rbp-88]
        mov     rdi, rax
        call    __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >::operator*() const
        mov     eax, DWORD PTR [rax]
        mov     DWORD PTR [rbp-28], eax
        mov     eax, DWORD PTR [rbp-28]
        mov     edi, eax
        call    moo(int)
        lea     rax, [rbp-88]
        mov     rdi, rax
        call    __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >::operator++()
.L3:
        lea     rdx, [rbp-96]
        lea     rax, [rbp-88]
        mov     rsi, rdx
        mov     rdi, rax
        call    bool __gnu_cxx::operator!=<int*, std::vector<int, std::allocator<int> > >(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > > const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > > const&)
        test    al, al
        jne     .L4

but with -O1

sample1 (simple for loop):

        movabs  rax, 12884901893
        movabs  rdx, 4294967298
        mov     QWORD PTR [r12], rax
        mov     QWORD PTR [r12+8], rdx
        mov     DWORD PTR [r12+16], 4
        mov     QWORD PTR [rsp+8], rbp
        mov     rbx, r12
        jmp     .L10
.L20:
        add     rbx, 4
        cmp     rbp, rbx
        je      .L19
.L10:
        mov     edi, DWORD PTR [rbx]
        call    moo(int)
        jmp     .L20

sample2 (reversed for loop):

        movabs  rax, 12884901893
        movabs  rdx, 4294967298
        mov     QWORD PTR [rbp+0], rax
        mov     QWORD PTR [rbp+8], rdx
        mov     DWORD PTR [rbp+16], 4
        lea     rbx, [rbp+16]
        jmp     .L4
.L8:
        sub     rbx, 4
.L4:
        mov     edi, DWORD PTR [rbx]
        call    moo(int)
        jmp     .L8

sample3 (foreach):

        mov     r12, rax
        lea     rbp, [rax+20]
        mov     QWORD PTR [rsp+16], rbp
        movabs  rax, 12884901893
        movabs  rdx, 4294967298
        mov     QWORD PTR [r12], rax
        mov     QWORD PTR [r12+8], rdx
        mov     DWORD PTR [r12+16], 4
        mov     QWORD PTR [rsp+8], rbp
        mov     rbx, r12
        jmp     .L10
.L20:
        add     rbx, 4
        cmp     rbx, rbp
        je      .L19
.L10:
        mov     edi, DWORD PTR [rbx]
        call    moo(int)
        jmp     .L20

Interesting how iterating forwards adds an extra boundary check and a jump (the cmp and je .L19 in samples #1 and #3), whereas iterating backwards does not - because, as noted above, the unsigned i >= 0 condition was compiled out entirely.

But this finding must come with one pretty big caveat: cache lines. The number of assembly instructions is a pretty poor measure of performance - after all, CPU instructions are ridiculously fast compared to any form of I/O, and memory access in particular.

The code is also available on GitHub: https://github.com/shybovycha/iterate-over-vector-in-cpp/

The benchmarking results seem to support the assumption: backwards iteration appears to be faster (all timings in nanoseconds):

Test              Min      Max      50%      90%      95%      delta
sample1 (++i)     1325447  3762733  1339309  1416805  1429724  +7%
sample2 (--i)     1290571  2596630  1308953  1334955  1342008  0%
sample3 (for(:))  1438847  3034474  1460698  1506001  1508657  +11%

Out of curiosity, I also benchmarked using the postfix operator instead of prefix - i++ (postfix) instead of ++i (prefix) - which shows the postfix operators are actually slightly faster:

Test                    Min      Max      50%      90%      95%      delta
sample1 prefix (++i)    1325447  3762733  1339309  1416805  1429724  +11%
sample1 postfix (i++)   1292195  4155596  1308721  1370542  1382136  +7%
sample2 prefix (--i)    1290571  2596630  1308953  1334955  1342008  +5%
sample2 postfix (i--)   1256929  2598426  1267088  1277779  1282787  0%

Interestingly enough, there is some difference:

  1. backward iteration with postfix decrement, for (auto i = vec.size() - 1; i > 0; i--), is the fastest (note the i > 0 condition - it stops before index 0, sidestepping the unsigned wraparound at the cost of skipping one element)
  2. backward iteration with prefix decrement --i is
  • 5% slower than i--
  3. forward iteration with postfix increment i++ is
  • 3% slower than --i
  • 7% slower than i--
  4. forward iteration with prefix increment ++i is the slowest:
  • 11% slower than i--
  • 7% slower than --i
  • 4% slower than i++
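
For plain integer indices an optimizing compiler typically emits identical code for ++i and i++; the distinction matters more for non-trivial iterator types, where the canonical increment operators genuinely differ - a minimal sketch:

struct Iter {
  int* p;

  // prefix: increments in place and returns a reference - no copy
  Iter& operator++() { ++p; return *this; }

  // postfix: has to copy the old state so it can be returned
  Iter operator++(int) { Iter old = *this; ++p; return old; }
};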

The above tests were run 10000 times, each over a heap-allocated std::vector of 10000 int elements.

Standard library iterators

Out of sheer curiosity, I decided to test how using an iterator would affect the results (sample4):

for (auto it = a.begin(); it != a.end(); ++it) {
  moo(*it);
}

With both the prefix and postfix variations:

for (auto it = a.begin(); it != a.end(); it++) {
  moo(*it);
}

And the reverse iterator (sample5):

for (auto it = a.end(); it != a.begin(); it--) {
  moo(*it); // dereferences a.end() on the first pass - undefined behaviour!
}
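
Note that this reverse loop dereferences a.end() on its first pass (undefined behaviour) and never visits the first element. The standard way to walk a vector backwards is via reverse iterators - a sketch:

for (auto it = a.rbegin(); it != a.rend(); ++it) {
  moo(*it);
}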

The combined results:

Test                    Min      Max      50%      90%      95%      delta
sample1 prefix (++i)    1325447  3762733  1339309  1416805  1429724  +11%
sample1 postfix (i++)   1292195  4155596  1308721  1370542  1382136  +7%
sample2 prefix (--i)    1290571  2596630  1308953  1334955  1342008  +4%
sample2 postfix (i--)   1256929  2598426  1267088  1277779  1282787  0%
sample3 (for(:))        1438847  3034474  1460698  1506001  1508657  +17%
sample4 prefix (++it)   1470583  2834386  1495310  1561023  1564850  +22%
sample4 postfix (it++)  1461634  2695552  1535905  1547865  1556815  +21%
sample5 prefix (--it)   1462156  2802714  1470567  1478957  1512621  +18%
sample5 postfix (it--)  1447920  2831859  1471334  1479670  1490088  +16%

The results are a bit surprising, looking at the assembly code generated by G++:

.L25:
        cmp     rbx, r13
        je      .L10
        mov     rbp, r13
        jmp     .L11
.L27:
        add     rbp, 4
        cmp     rbx, rbp
        je      .L26
.L11:
        mov     edi, DWORD PTR [rbp+0]
        call    moo(int)
        jmp     .L27
.L26:
        test    r13, r13
        je      .L18
        mov     rsi, r12
        sub     rsi, r13
.L15:
        mov     rdi, r13
        call    operator delete(void*, unsigned long)
        ; ... cleanup & exit ...
        ret

Clang 17.0.1 is a slightly different story:

.LBB0_1:
        cmp     r14, rax
        jne     .LBB0_2
        sub     r14, r15
        movabs  rax, 9223372036854775804
        cmp     r14, rax
        mov     qword ptr [rsp], r15
        je      .LBB0_11
        mov     rbp, r14
        sar     rbp, 2
        cmp     rbp, 1
        mov     rax, rbp
        adc     rax, 0
        lea     rdx, [rax + rbp]
        mov     rcx, r12
        cmp     rdx, r12
        jbe     .LBB0_14
        add     rax, rbp
        jae     .LBB0_16
.LBB0_17:
        test    r12, r12
        je      .LBB0_18
.LBB0_19:
        lea     rdi, [4*r12]
        call    operator new(unsigned long)@PLT
        mov     r15, rax
        jmp     .LBB0_21
.LBB0_14:
        mov     rcx, rdx
        add     rax, rbp
        jb      .LBB0_17
.LBB0_16:
        mov     r12, rcx
        test    r12, r12
        jne     .LBB0_19
.LBB0_18:
        xor     r15d, r15d
.LBB0_21:
        mov     dword ptr [r15 + 4*rbp], r13d
        test    r14, r14
        mov     rbx, qword ptr [rsp]
        jle     .LBB0_23
        mov     rdi, r15
        mov     rsi, rbx
        mov     rdx, r14
        call    memmove@PLT
.LBB0_23:
        test    rbx, rbx
        je      .LBB0_25
        mov     rdi, rbx
        call    operator delete(void*)@PLT
.LBB0_25:
        lea     rbp, [r15 + 4*rbp]
        lea     rax, [r15 + 4*r12]
        movabs  r12, 2305843009213693951
        jmp     .LBB0_26
.LBB0_3:
        cmp     r15, r14
        je      .LBB0_7
        lea     r14, [r15 - 4]
.LBB0_5:
        mov     edi, dword ptr [r14 + 4]
        call    moo(int)@PLT
        add     r14, 4
        cmp     r14, rbp
        jne     .LBB0_5
.LBB0_7:
        test    r15, r15
        je      .LBB0_9
        mov     rdi, r15
        call    operator delete(void*)@PLT
.LBB0_9:
        xor     eax, eax
        add     rsp, 8
        pop     rbx
        pop     r12
        pop     r13
        pop     r14
        pop     r15
        pop     rbp
        ret
.LBB0_11:
        lea     rdi, [rip + .L.str.1]
        call    std::__throw_length_error(char const*)@PLT
        jmp     .LBB0_30
        jmp     .LBB0_30
        mov     qword ptr [rsp], r15
.LBB0_30:
        mov     r14, rax
        cmp     qword ptr [rsp], 0
        je      .LBB0_32
        mov     rdi, qword ptr [rsp]
        call    operator delete(void*)@PLT
.LBB0_32:
        mov     rdi, r14
        call    _Unwind_Resume@PLT

Caching impact

One other assumption is that iterating over a vector backwards interacts poorly with memory caching. This is a rather tricky thing to test, since there are two distinct mechanisms through which it could happen:

  1. swap usage - happens when the dataset is so big it does not fit in the available RAM and the OS has to fall back to using disk instead; this is the worst-case scenario
  2. data locality / CPU cache usage - CPUs try to predict memory access patterns and pre-load more data than a given operation actually requires; when a data block is accessed at random points (indices), the CPU fails to pre-load the right slices of memory

Since this blog is about iterating over a vector, the straightforward approach would be to generate absurdly large chunks of data and iterate over them. But the swap scenario would only kick in with absurdly large datasets - my machine, for instance, sports 32GB of RAM, so it would be quite a task to generate multiple 32GB files.

Simulating random memory access would defeat the purpose of iterating over the elements one by one - in those cases we either use iterators to walk a linked list (where each element lives at an arbitrary address in RAM), or access elements via indices dereferenced from another block of memory, with no way to iterate by simply incrementing an index. Either way, it has little to do with comparing for loops.

Simulating access across multiple cache lines is actually easier - we just need to replace the integers with structures spanning several cache lines and use std::list (nodes scattered across the heap) instead of std::vector (a single contiguous block of memory):

struct MyStruct {
  bool f0; // 1 byte
  float f1[100]; // sizeof(float) * 100 = 4 * 100 = 400 bytes
  double f2[53]; // sizeof(double) * 53 = 8 * 53 = 424 bytes
  char f3[64]; // sizeof(char) * 64 = 1 * 64 = 64 bytes
}; // total = 889 bytes of data (sizeof is 896 with alignment padding), spanning multiple cache lines

std::list<MyStruct> a;

and generate a large enough number of these objects:

#include <limits>
#include <random>

std::random_device rd;
std::mt19937_64 gen(rd());

std::uniform_int_distribution<int> bool_dist(0, 1);
std::uniform_real_distribution<float> float_dist(0.0f, 1000.0f);
std::uniform_int_distribution<int> int_dist(std::numeric_limits<int>::min(), std::numeric_limits<int>::max());

for (auto i = 0; i < 10000; ++i) {
  MyStruct e;

  e.f0 = static_cast<bool>(bool_dist(gen));

  for (auto t = 0; t < 100; ++t)
    e.f1[t] = float_dist(gen);

  for (auto t = 0; t < 53; ++t)
    e.f2[t] = static_cast<double>(float_dist(gen));

  for (auto t = 0; t < 64; ++t)
    e.f3[t] = static_cast<char>(int_dist(gen) % 255);

  a.push_back(e);
}

For the pointer-based variant of the struct introduced below, the heap-allocated arrays would also have to be freed at the end to prevent memory leaks (assuming they were allocated with new[]):

for (auto& item : a) {
  delete[] item.f1;
  delete[] item.f2;
  delete[] item.f3;
}

a.clear();
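
For reference, a minimal sketch of how a single pass could be timed (the measurements below are in nanoseconds; the actual harness lives in the linked repository and may differ):

#include <chrono>

// hypothetical helper: times one pass over the container, in nanoseconds
template <typename F>
long long measure_ns(F&& body) {
  const auto start = std::chrono::steady_clock::now();
  body();
  const auto end = std::chrono::steady_clock::now();
  return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count();
}

// usage, repeated e.g. 10000 times to collect the percentiles
// (consume is a stand-in for whatever the benchmark body does):
// auto ns = measure_ns([&] { for (auto& e : a) consume(e); });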

These benchmarks take significantly longer to run and produce much bigger numbers (still nanoseconds):

Test              Min      Max       50%      90%      95%      delta
sample1 (++i)     7882057  15385569  7948012  8004355  8031673  0%
sample2 (--i)     7915870  16297070  8971293  9109493  9151567  +13%
sample3 (for(:))  8094894  16496526  8159004  8229138  8259364  +3%

Compare this to a structure which actually fits in one cache line. Cache line size is apparently 64 B - that is, sixty-four bytes - on most CPUs (AMD Zen, Intel Ice Lake). My Apple M1 laptop has twice that, 128 B, as reported by running sysctl -a | grep 'hw.cachelinesize'.
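
On the C++ side, C++17 exposes a related compile-time constant - a sketch, with the caveat that not every standard library implementation actually defines it:

#include <iostream>
#include <new>

int main() {
  // a portable hint for the cache line size of the target platform
  // (std::hardware_destructive_interference_size, since C++17)
  std::cout << std::hardware_destructive_interference_size << " B\n";
}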

If the structure was reorganized to fit in that limit, like so:

struct MyStruct {
  bool f0; // 1 byte
  float* f1; // sizeof(float*) = 8 bytes
  double* f2; // sizeof(double*) = 8 bytes
  char* f3; // sizeof(char*) = 8 bytes
}; // total = 25 bytes of data (sizeof is 32 with padding) - fits in a single cache line
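
With this layout the arrays live elsewhere on the heap, which is what the cleanup loop shown earlier is for - the generation step would change accordingly (a sketch, assuming new[] allocations):

MyStruct e;

e.f0 = static_cast<bool>(bool_dist(gen));

e.f1 = new float[100];
e.f2 = new double[53];
e.f3 = new char[64];

// ...fill the arrays exactly as before, then:

a.push_back(e); // the node itself now fits in a single cache line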

The timings are considerably lower, especially for the backwards iteration:

Test              Min      Max       50%      90%      95%      delta
sample1 (++i)     7575860  20005185  7640651  7700057  7725933  0%
sample2 (--i)     7466816  18068148  7657415  8109531  8208603  +6%
sample3 (for(:))  7832724  14264212  7882930  7932295  7960718  +3%

This highlights how much predictive cache loading - and whether a data element fits into a single CPU cache line - actually impacts RAM reads: the margin is significant.

Conclusion

In the simplest case (a vector of numbers), the worst case scenario is within 11% difference, with backwards iteration being the fastest, followed by forward iteration, with foreach being the slowest of the bunch; postfix decrement in the backwards iteration is the fastest variant overall. Bear in mind: these numbers are in nanoseconds, meaning the worst case scenario is 10000 (ten thousand) int elements being iterated in 1.5 ms and the fastest in 1.3 ms - that is milliseconds. For comparison, rendering a frame at 120 FPS takes about 8.3 ms. And if you were to process requests in a client-server system, you would be able to handle about 699 requests per second while iterating over such a vector. So realistically, whichever way you iterate a vector of integers, the choice of technique would only matter if you are working on a really high-performance system.

That being said, all of this only holds while the vector elements fit in one CPU cache line or cache block - that is, while the data stays in the L1 and L2 caches.

So if a data element does not fit in a block of 64 B (or 128 B on some CPUs), the performance impact of backwards iteration becomes more significant. In this case we are talking about 8 ms vs 9.1 ms per pass. In terms of FPS, the difference would be 125 FPS vs 109 FPS - or about 15 requests per second more for the forward iteration.