Iterating a vector in C++
Such a simple topic - iterating over a vector, is it even worth discussing?
Interestingly enough, there is a difference in how exactly you iterate - be it using iterators, for(:) sugar or plain old for(i=0; i<vec.size(); ++i).
Let us see what output does a compiler produce in each of these cases.
sample1 (simple for loop):
std::vector<int> data{ 5, 3, 2, 1, 4 };
for (auto i = 0; i < data.size(); ++i) {
moo(data[i]);
}
mov DWORD PTR [rbp-20], 0
jmp .L3
.L4:
mov eax, DWORD PTR [rbp-20]
movsx rdx, eax
lea rax, [rbp-64]
mov rsi, rdx
mov rdi, rax
call std::vector<int, std::allocator<int> >::operator[](unsigned long)
mov eax, DWORD PTR [rax]
mov edi, eax
call moo(int)
add DWORD PTR [rbp-20], 1
.L3:
mov eax, DWORD PTR [rbp-20]
movsx rbx, eax
lea rax, [rbp-64]
mov rdi, rax
call std::vector<int, std::allocator<int> >::size() const
cmp rbx, rax
setb al
test al, al
jne .L4
but
sample2 (reversed for loop):
for (auto i = data.size() - 1; i >= 0; --i) {
moo(data[i]);
}
lea rax, [rbp-64]
mov rdi, rax
call std::vector<int, std::allocator<int> >::size() const
sub rax, 1
mov QWORD PTR [rbp-24], rax
.L3:
mov rdx, QWORD PTR [rbp-24]
lea rax, [rbp-64]
mov rsi, rdx
mov rdi, rax
call std::vector<int, std::allocator<int> >::operator[](unsigned long)
mov eax, DWORD PTR [rax]
mov edi, eax
call moo(int)
sub QWORD PTR [rbp-24], 1
jmp .L3
also
sample3 (foreach):
for (auto i : data) {
moo(i)
}
lea rax, [rbp-80]
mov QWORD PTR [rbp-24], rax
mov rax, QWORD PTR [rbp-24]
mov rdi, rax
call std::vector<int, std::allocator<int> >::begin()
mov QWORD PTR [rbp-88], rax
mov rax, QWORD PTR [rbp-24]
mov rdi, rax
call std::vector<int, std::allocator<int> >::end()
mov QWORD PTR [rbp-96], rax
jmp .L3
.L4:
lea rax, [rbp-88]
mov rdi, rax
call __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >::operator*() const
mov eax, DWORD PTR [rax]
mov DWORD PTR [rbp-28], eax
mov eax, DWORD PTR [rbp-28]
mov edi, eax
call moo(int)
lea rax, [rbp-88]
mov rdi, rax
call __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > >::operator++()
.L3:
lea rdx, [rbp-96]
lea rax, [rbp-88]
mov rsi, rdx
mov rdi, rax
call bool __gnu_cxx::operator!=<int*, std::vector<int, std::allocator<int> > >(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > > const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int> > > const&)
test al, al
jne .L4
but with -O1
sample1 (simple for loop):
movabs rax, 12884901893
movabs rdx, 4294967298
mov QWORD PTR [r12], rax
mov QWORD PTR [r12+8], rdx
mov DWORD PTR [r12+16], 4
mov QWORD PTR [rsp+8], rbp
mov rbx, r12
jmp .L10
.L20:
add rbx, 4
cmp rbp, rbx
je .L19
.L10:
mov edi, DWORD PTR [rbx]
call moo(int)
jmp .L20
sample2 (reversed for loop):
movabs rax, 12884901893
movabs rdx, 4294967298
mov QWORD PTR [rbp+0], rax
mov QWORD PTR [rbp+8], rdx
mov DWORD PTR [rbp+16], 4
lea rbx, [rbp+16]
jmp .L4
.L8:
sub rbx, 4
.L4:
mov edi, DWORD PTR [rbx]
call moo(int)
jmp .L8
sample3 (foreach):
mov r12, rax
lea rbp, [rax+20]
mov QWORD PTR [rsp+16], rbp
movabs rax, 12884901893
movabs rdx, 4294967298
mov QWORD PTR [r12], rax
mov QWORD PTR [r12+8], rdx
mov DWORD PTR [r12+16], 4
mov QWORD PTR [rsp+8], rbp
mov rbx, r12
jmp .L10
.L20:
add rbx, 4
cmp rbx, rbp
je .L19
.L10:
mov edi, DWORD PTR [rbx]
call moo(int)
jmp .L20
Interesting how iterating forwards adds an extra boundary check and a jump (cmd rbx, rbp and je .L19 in samples #1 and #3),
whereas iterating backwards does not.
But this find actually must come with one pretty big caveat: the cache lines. The number of assembly instructions is a pretty poor measure of performance - after all, CPU instructions are ridiculously fast, compared to any form of IO - specifically memory access.
The code is also available on Github: [https://github.com/shybovycha/iterate-over-vector-in-cpp/]
The benchmarking results seem to prove the assumption: bacwards iteration seems to be faster:
| Test | Min | Max | 50% | 90% | 95% | delta |
|---|---|---|---|---|---|---|
sample1 (++i) | 1325447 | 3762733 | 1339309 | 1416805 | 1429724 | +7% |
sample2 (--i) | 1290571 | 2596630 | 1308953 | 1334955 | 1342008 | 0% |
sample3 (for(:)) | 1438847 | 3034474 | 1460698 | 1506001 | 1508657 | +11% |
Out of curiosity, I also benchmarked using the postfix operator instead of prefix: i++ (postfix) instead of ++i (prefix), which shows prefix operator is slightly faster:
| Test | Min | Max | 50% | 90% | 95% | delta |
|---|---|---|---|---|---|---|
sample1 prefix (++i) | 1325447 | 3762733 | 1339309 | 1416805 | 1429724 | +11% |
sample1 postfix (i++) | 1292195 | 4155596 | 1308721 | 1370542 | 1382136 | +7% |
sample2 prefix (--i) | 1290571 | 2596630 | 1308953 | 1334955 | 1342008 | +5% |
sample2 postfix (i--) | 1256929 | 2598426 | 1267088 | 1277779 | 1282787 | 0% |
Interestingly enough, there is some difference:
- backward iteration with postfix decrement
for (auto i=vec.size()-1; i>0; i--)is the fastest - backward iteration with prefix decrement
--iis
5%slower thani--
- forward iteration with postfix increment
i++is
3%slower than--i7%slower thani--
- forward iteration with prefix increment
++iis the slowest:
11%slower thani--7%slower than--i4%slower thani++
The above tests were ran 10000 times on a heap-allocated (std::vector) 10000 int elements.
Standard library iterators
Out of sheer curiosity, I decided to test how using an iterator would affect the results (sample4):
for (auto it = a.begin(); it != a.end(); ++it) {
moo(*it);
}
With both prefix and suffix (postfix) variations:
for (auto it = a.begin(); it != a.end(); it++) {
moo(*it);
}
And the reverse iterator (sample5):
for (auto it = a.end(); it != a.begin(); it--) {
moo(*it);
}
| Test | Min | Max | 50% | 90% | 95% | delta |
|---|---|---|---|---|---|---|
sample1 prefix (++i) | 1325447 | 3762733 | 1339309 | 1416805 | 1429724 | +11% |
sample1 postfix (i++) | 1292195 | 4155596 | 1308721 | 1370542 | 1382136 | +7% |
sample2 prefix (--i) | 1290571 | 2596630 | 1308953 | 1334955 | 1342008 | +4% |
sample2 postfix (i--) | 1256929 | 2598426 | 1267088 | 1277779 | 1282787 | 0% |
sample3 (for(:)) | 1438847 | 3034474 | 1460698 | 1506001 | 1508657 | +17% |
sample4 prefix (++it) | 1470583 | 2834386 | 1495310 | 1561023 | 1564850 | +22% |
sample4 postfix (it++) | 1461634 | 2695552 | 1535905 | 1547865 | 1556815 | +21% |
sample5 prefix (--it) | 1462156 | 2802714 | 1470567 | 1478957 | 1512621 | +18% |
sample5 postfix (it--) | 1447920 | 2831859 | 1471334 | 1479670 | 1490088 | +16% |
The results are a bit surprising, looking at the assembly code generated by G++:
.L25:
cmp rbx, r13
je .L10
mov rbp, r13
jmp .L11
.L27:
add rbp, 4
cmp rbx, rbp
je .L26
.L11:
mov edi, DWORD PTR [rbp+0]
call moo(int)
jmp .L27
.L26:
test r13, r13
je .L18
mov rsi, r12
sub rsi, r13
.L15:
mov rdi, r13
call operator delete(void*, unsigned long)
; ... cleanup & exit ...
ret
Clang 17.0.1 is a slightly different story:
.LBB0_1:
cmp r14, rax
jne .LBB0_2
sub r14, r15
movabs rax, 9223372036854775804
cmp r14, rax
mov qword ptr [rsp], r15
je .LBB0_11
mov rbp, r14
sar rbp, 2
cmp rbp, 1
mov rax, rbp
adc rax, 0
lea rdx, [rax + rbp]
mov rcx, r12
cmp rdx, r12
jbe .LBB0_14
add rax, rbp
jae .LBB0_16
.LBB0_17:
test r12, r12
je .LBB0_18
.LBB0_19:
lea rdi, [4*r12]
call operator new(unsigned long)@PLT
mov r15, rax
jmp .LBB0_21
.LBB0_14:
mov rcx, rdx
add rax, rbp
jb .LBB0_17
.LBB0_16:
mov r12, rcx
test r12, r12
jne .LBB0_19
.LBB0_18:
xor r15d, r15d
.LBB0_21:
mov dword ptr [r15 + 4*rbp], r13d
test r14, r14
mov rbx, qword ptr [rsp]
jle .LBB0_23
mov rdi, r15
mov rsi, rbx
mov rdx, r14
call memmove@PLT
.LBB0_23:
test rbx, rbx
je .LBB0_25
mov rdi, rbx
call operator delete(void*)@PLT
.LBB0_25:
lea rbp, [r15 + 4*rbp]
lea rax, [r15 + 4*r12]
movabs r12, 2305843009213693951
jmp .LBB0_26
.LBB0_3:
cmp r15, r14
je .LBB0_7
lea r14, [r15 - 4]
.LBB0_5:
mov edi, dword ptr [r14 + 4]
call moo(int)@PLT
add r14, 4
cmp r14, rbp
jne .LBB0_5
.LBB0_7:
test r15, r15
je .LBB0_9
mov rdi, r15
call operator delete(void*)@PLT
.LBB0_9:
xor eax, eax
add rsp, 8
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.LBB0_11:
lea rdi, [rip + .L.str.1]
call std::__throw_length_error(char const*)@PLT
jmp .LBB0_30
jmp .LBB0_30
mov qword ptr [rsp], r15
.LBB0_30:
mov r14, rax
cmp qword ptr [rsp], 0
je .LBB0_32
mov rdi, qword ptr [rsp]
call operator delete(void*)@PLT
.LBB0_32:
mov rdi, r14
call _Unwind_Resume@PLT
Caching impact
One other assumption is that iterating over a vector backwards affects the memory caching in a pretty poor manner. This is a rather complex scenario to test, since there are two potential scenarios on how this could happen:
- swap memory usage - happens when the dataset is so big it does not fit in the available RAM and OS has to switch to using disk instead of RAM; this is the worst-case scenario
- data locality / CPU cache usage - CPUs usually try to predict memory access patterns and pre-load more data than actually required by a given operation; when the data block is accessed at random points (indices), CPU fails to pre-load the correct slices of memory
Since this blog is about iterating over a vector, it would be seem to be easier to generate absurdely large chunks of data and try to iterate over them. But in reality it would only happen with absurdely large files - my machine, for instance, sports 32GB of RAM, so it would be quite a task to generate multiple 32GB files.
Simulating random memory access will break the purpose of iterating over the elements one by one - in these cases we either use iterators to access a linked list (each element is in random part of RAM) or access elements by index. And there is no way to iterate over them by incrementing an index.
Alternatively, we have to dereference indexes from another block of memory. Either way it has nothing to do comparing for loops.
Simulating accessing multiple cache lines could actually be easier - we just need to replace integers with structures of variable size and use std::list (linked list) instead of std::vector (single block of memory):
struct MyStruct {
bool f0; // 1 byte
float f1[100]; // sizeof(float) * 100 = 4 * 100 = 400 bytes
double f2[53]; // sizeof(double) * 53 = 8 * 53 = 424 bytes
char f3[64]; // sizeof(char) * 64 = 1 * 64 = 64 bytes
}; // total size = 889 bytes, likely to span across multiple cache lines
std::list<MyStruct> a;
and generate a large enough number of these objects:
#include <random>
std::random_device rd;
std::mt19937_64 gen(rd());
std::uniform_int_distribution<int> bool_dist(0, 1);
std::uniform_real_distribution<float> float_dist(0.0f, 1000.0f);
std::uniform_int_distribution<int> int_dist(std::numeric_limits<int>::min(), std::numeric_limits<int>::max());
for (auto i = 0; i < 10000; ++i) {
MyStruct e;
e.f0 = static_cast<bool>(bool_dist(gen));
for (auto t = 0; t < 100; ++t)
e.f1[t] = float_dist(gen);
for (auto t = 0; t < 53; ++t)
e.f2[t] = static_cast<double>(float_dist(gen));
for (auto t = 0; t < 64; ++t)
e.f3[t] = static_cast<char>(int_dist(gen) % 255);
a.push_back(e);
}
Just to prevent memory leaks, cleanup the test data at the end:
for (auto& item : a) {
delete item.f3;
}
a.clear();
These benchmarks take significantly longer time to run and produce big numbers:
| Test | Min | Max | 50% | 90% | 95% | delta |
|---|---|---|---|---|---|---|
sample1 (++i) | 7882057 | 15385569 | 7948012 | 8004355 | 8031673 | 0% |
sample2 (--i) | 7915870 | 16297070 | 8971293 | 9109493 | 9151567 | +13% |
sample3 (for(:)) | 8094894 | 16496526 | 8159004 | 8229138 | 8259364 | +3% |
Compare this to a structure which actually fits in one cache line.
Cache line size is apparently 64 B - that is sixty-four bytes - on most CPUs (AMD Zen, Intel Ice Lake).
My Apple M1 laptop has twice as much, 128 B, as shown by running sysctl -a | grep 'hw.cachelinesize'.
If the structure was reorganized to fit in that limit, like so:
struct MyStruct {
bool f0; // 1 byte
float* f1; // sizeof(float*) = 8 bytes
double* f2; // sizeof(double*) = 8 bytes
char* f3; // sizeof(char*) = 8 bytes
}; // total size = 25 bytes, likely to fir in a single cache line
The timings are considerably lower for the backwards iteration:
| Test | Min | Max | 50% | 90% | 95% | delta |
|---|---|---|---|---|---|---|
sample1 (++i) | 7575860 | 20005185 | 7640651 | 7700057 | 7725933 | 0% |
sample2 (--i) | 7466816 | 18068148 | 7657415 | 8109531 | 8208603 | +6% |
sample3 (for(:)) | 7832724 | 14264212 | 7882930 | 7932295 | 7960718 | +3% |
This highlights how predictive cache loading and data element size that fits into a single CPU cache line actually impacts RAM reads - by a significant margin.
Conclusion
In simplest case (vector of numbers), the worst case scenario is within 11% difference, with backwards iteration being the fastest, followed by forward iteration and foreach being the slowest of the bunch.
Prefix decrement in the backwards iteration being the fastest.
Bear in mind: these numbers are in nanoseconds, meaning the worst case scenario is 10000 (ten thousand) elements of variable size being iterated in 1.5 ms and the fastest being 1.3 ms - that is milliseconds.
For comparison, rendering a frame at 120 FPS rate would take about 8.3 ms. And if you were to process some requests (in a client-server system), you would be able to handle 699 requests per second while iterating over this list.
So realistically, the way you iterate a vector of integers, any of these techniques would only really matter if you are working on a really high performance system.
That being said, this is all true while the vector elements fit in one CPU cache line or cache block, that is L1 and L2 caches.
So if the data element does not fit in a block of 64 B (or 128 B on some CPUs), the performance impact of backwards iteration is going to be more significant.
In this case we are talking about 8 ms vs 9.1 ms. In terms of FPS, the difference would be 125 FPS vs 109 FPS. Or about 15 requests per second more.