NumPy integration gcc experiments

Note: Everywhere below where it says "buf" it should be "data".

Access strategies

While different plain-array access methods (C-style, Fortran-style) can be added, this document will be about generic multidimensionsal access using strides and slices, which I expect will be the common choice because of user convenience and no buffer copying without the user knowing about it.

So, one has code like this in Cython:

arr[i, j]

and the question is if this can through a normal Python operator overload generate code to look up a NumPy array using strides and still be as efficient as possible.

The question then is: Is it possible to put the stride access and division within a C array access function, and hope that GCC inlining deals with it?

The answer: No, unfortunately not ... at least gcc -O3 didn't deal with it. Experiment below.

Note: This really makes sense and is the expected answer, because you cannot know whether buf and strides overlaps compile-time.

Possible workarounds

The experiment

void test1(ndarray arr) {
  int di = arr.strides[0], dj = arr.strides[1];
  int i, j;
  for (i = 0; i != 100; ++i) {
    for (j = 0; j != 100; ++j) {
        *((int*)(arr.buf + i*di + j*dj)) = i*j;
    }
  }
}
===
.L3:
        movslq  %ecx,%rax
        addl    $1, %r8d
        addl    %r11d, %ecx
        movl    %edx, (%rax,%r10)
        addl    %r9d, %edx
        cmpl    $100, %r8d
        jne     .L3

void test2(ndarray arr) {
  int di = arr.strides[0] / sizeof(int), dj = arr.strides[1] / sizeof(int);
  int i, j;
  int* buf = (int*)arr.buf;
  for (i = 0; i != 100; ++i) {
    for (j = 0; j != 100; ++j) {
        buf[i*di + j*dj] = i*j;
    }
  }
}
===
.L12:
        movslq  %esi,%rax
        addl    $1, %ecx
        addl    %r9d, %esi
        movl    %edx, (%rdi,%rax,4)
        addl    %r8d, %edx
        cmpl    $100, %ecx
        jne     .L12

void test3(ndarray arr) {
  int i, j;
  int* buf = (int*)arr.buf;
  for (i = 0; i != 100; ++i) {
    for (j = 0; j != 100; ++j) {
      *((int*)(arr.buf + i * arr.strides[0] + j * arr.strides[1])) = i*j;
    }
  }
}
===
.L20:
        movl    %ecx, %eax
        movl    %r9d, %edx
        addl    $1, %ecx
        imull   (%rsi), %edx
        imull   (%r11), %eax
        movslq  %edx,%rdx
        cltq
        addq    %rax, %rdx
        movl    %r8d, (%rdx,%rdi)
        addl    %r10d, %r8d
        cmpl    $100, %ecx
        jne     .L20

Code: test.c

enhancements/numpy/gcc (last edited 2009-03-01 01:02:24 by localhost)