Issue 10287 - std.random.uniform is very slow with dmd
Summary: std.random.uniform is very slow with dmd
Status: NEW
Alias: None
Product: D
Classification: Unclassified
Component: dmd (show other issues)
Version: D2
Hardware: x86 Windows
: P4 enhancement
Assignee: No Owner
URL:
Keywords:
Depends on:
Blocks:
 
Reported: 2013-06-06 17:39 UTC by bearophile_hugs
Modified: 2022-12-17 10:42 UTC (History)
0 users

See Also:


Attachments

Note You need to log in before you can comment on or make changes to this issue.
Description bearophile_hugs 2013-06-06 17:39:54 UTC
This program runs about 6 times faster compiled with ldc2 compared to dmd (I run it with 100000000 command line argument):


import core.stdc.stdio, std.random, std.conv;

void main(in string[] args) {
    immutable uint N = (args.length == 2) ?
                       args[1].to!uint :
                       1_000;
    auto rng = Xorshift(0);

    uint total = 0;
    for (uint i = 0; i < N; i++)
        total += uniform(0, 10, rng);

    printf("%u\n", total);
}



If I replace this line:
total += uniform(0, 10, rng);

with:
total += rng.front;  rng.popFront;

Then the code compiled with ldc2 is only about 30% faster.


dmd 2.064alpha, -O -release -inline -noboundscheck.

Ldc2 V. 0.11.0, based on DMD v2.062 and LLVM 3.3svn, -O5 -release -profile-verifier-noassert.
Comment 1 bearophile_hugs 2013-06-06 17:54:27 UTC
This alternative version of the program is handy to produce a more readable asm output:


import core.stdc.stdio: printf;
import core.stdc.stdlib: atoi;
import std.random: Xorshift, uniform;

uint randomSum(in uint n, Xorshift rng) {
    uint total = 0;

    for (uint i = 0; i < n; i++)
        total += uniform(0, 10, rng);

    return total;
}

void main(in string[] args) {
    immutable uint N = (args.length == 2) ?
                       atoi((args[1] ~ '\0').ptr) :
                       1_000;
    auto rng = Xorshift(0);

    immutable result = randomSum(N, rng);

    printf("%u\n", result);
}



LDC2 produces for the randomSum() function (the inner loop is 26 instructions long, with no call instructions):


__D4temp9randomSumFxkS3std6random38__T14XorshiftEngineTkVi128Vi11Vi8Vi19Z14XorshiftEngineZk:
    pushl   %ebp
    pushl   %ebx
    pushl   %edi
    pushl   %esi
    pushl   %eax
    xorl    %eax, %eax
    cmpl    $0, 40(%esp)
    je  LBB0_5
    leal    24(%esp), %ecx
    movl    8(%ecx), %esi
    movl    (%ecx), %eax
    movl    4(%ecx), %edx
    movl    12(%ecx), %ebp
    xorl    %ebx, %ebx
    movl    $0, (%esp)
    .align  16, 0x90
LBB0_2:
    movl    %esi, %edi
    movl    %edx, %ecx
    movl    %ebp, %esi
    movl    %eax, %ebp
    shll    $11, %ebp
    xorl    %eax, %ebp
    movl    %esi, %eax
    shrl    $19, %eax
    xorl    %esi, %eax
    xorl    %ebp, %eax
    shrl    $8, %ebp
    xorl    %eax, %ebp
    cmpl    $-7, %esi
    movl    %edi, %edx
    movl    %ecx, %eax
    ja  LBB0_2
    movl    %esi, %eax
    movl    $671088641, %edx
    mull    %edx
    shrl    $26, %edx
    addl    %edx, (%esp)
    incl    %ebx
    cmpl    40(%esp), %ebx
    movl    %edi, %edx
    movl    %ecx, %eax
    jne LBB0_2
    leal    24(%esp), %eax
    movl    %ebp, 12(%eax)
    movl    %ecx, (%eax)
    movl    %edi, 4(%eax)
    movl    %esi, 8(%eax)
    movl    (%esp), %eax
LBB0_5:
    addl    $4, %esp
    popl    %esi
    popl    %edi
    popl    %ebx
    popl    %ebp
    ret $20


Maybe it's also a matter of missed inlining.