If you define `THR`, the code does the same job, but in another thread. I only measured the time spent in the `write` call.
Running the code with `./some-file-name >/dev/null`, this is the result I get (accumulated clock cycles per 100000 calls).
`THR` not defined:
1 48930106
2 43946464
3 44669126
4 45918011
5 44108477
6 43608789
7 45104427
8 49676889
9 44682305
10 47516931
`THR` defined:
1 108347418
2 101670307
3 101726085
4 100531554
5 100137343
6 85837022
7 105556754
8 104681843
9 110303338
10 104666783
Why is `write` so much slower when it is called from another thread?
The system is Fedora Linux.
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <semaphore.h>
#include <fcntl.h>
#include <unistd.h>
#include <immintrin.h>
#ifdef __cplusplus
#include <atomic>
using namespace std;
#else
#include <stdatomic.h>
#endif
/* Number of bytes in the buffer handed to each write() call (0x100000 = 1 MiB). */
#define SIZE 0x100000
/*
 * Read the processor's time-stamp counter through the __rdtscp intrinsic.
 * The auxiliary value the intrinsic stores alongside the counter is
 * discarded; only the counter itself is returned.
 */
static unsigned long long rdtscp() {
    unsigned aux;
    unsigned long long ticks = __rdtscp(&aux);
    return ticks;
}
/* Payload buffer: main fills it with 'a' and every write() sends it to fd 1. */
static char b[SIZE];
/* Running total of time-stamp-counter ticks spent inside write(); main reads and resets it when reporting. */
static atomic_ullong oc;
#ifdef THR
/* Handshake semaphores: s[0] tells the worker thread to start a write, s[1] tells main that it finished. */
static sem_t s[2];
void *out(void *_) {
for (;;) {
sem_wait(s);
unsigned long long c = rdtscp();
write(1, b, SIZE);
oc += rdtscp() - c;
sem_post(s + 1);
}
return _;
}
#endif
/*
 * Benchmark driver.
 *
 * Fills b with 'a', then loops forever writing it to file descriptor 1 and
 * accumulating in oc the time-stamp-counter ticks spent inside write().
 * Every 100000 iterations the accumulated total is reset and printed to
 * stderr together with the report number.
 *
 * With THR defined the write is delegated to the thread running out():
 * main posts s[0] to start a round and waits on s[1] for it to finish.
 * Without THR the write is timed directly on the main thread.
 *
 * Returns 1 if setup or (in the non-THR build) a write fails; otherwise it
 * never returns.
 */
int main() {
    memset(b, 'a', SIZE);
#ifdef THR
    if (sem_init(s, false, 0) != 0 || sem_init(s + 1, false, 0) != 0) {
        perror("sem_init");
        return 1;
    }
    pthread_t t;
    /* pthread_create reports failure through its return value, not errno.
     * Without a worker the sem_wait below would block forever. */
    int rc = pthread_create(&t, NULL, out, NULL);
    if (rc != 0) {
        fprintf(stderr, "pthread_create: %s\n", strerror(rc));
        return 1;
    }
#endif
    /* The loop is unbounded, so the counter is unsigned and 64 bits wide:
     * a signed int would overflow (undefined behaviour) after 2^31 rounds. */
    for (unsigned long long i = 1;; ++i) {
#ifdef THR
        sem_post(s);
        sem_wait(s + 1);
#else
        unsigned long long c = rdtscp();
        ssize_t written = write(1, b, SIZE);
        oc += rdtscp() - c;
        /* Checked after the second timestamp so the test itself is not timed. */
        if (written < 0) {
            perror("write");
            return 1;
        }
#endif
        const unsigned long long d = 100000;
        if (!(i % d)) {
            /* Fetch and clear the total in one atomic step. */
            unsigned long long total = atomic_exchange(&oc, 0);
            fprintf(stderr, "%4llu%12llu\n", i / d, total);
        }
    }
}
Not sure if this is okay, but I made the code compile as both C and C++ so that I could add the C++ tag. I will roll back if this is inappropriate.