Issue 9991 - Optimizer Doesn't Enregister Static Array Elements With Compile-Time Constant Index
Summary: Optimizer Doesn't Enregister Static Array Elements With Compile-Time Constant Index
Status: NEW
Alias: None
Product: D
Classification: Unclassified
Component: dmd (show other issues)
Version: D2
Hardware: All All
Importance: P4 enhancement
Assignee: No Owner
URL:
Keywords: performance
Depends on:
Blocks:
 
Reported: 2013-04-25 09:34 UTC by David Simcha
Modified: 2022-12-17 10:42 UTC (History)
3 users (show)

See Also:


Attachments

Note You need to log in before you can comment on or make changes to this issue.
Description David Simcha 2013-04-25 09:34:00 UTC
Can't static array elements where the index is known at compile time be treated just like regular stack variables?  GDC seems to treat them as such.

D Source code below.  I'd expect these two implementations to compile to the exact same ASM.  In GDC, the main loops do, though the pre-loop setup stuff does compile differently for reasons I don't understand.

// Sums the elements of `arr` four at a time, keeping the four partial sums in
// a static array `sums` that is only ever indexed with compile-time constants.
// Elements left over when arr.length is not a multiple of 4 are not added.
int sum1(const int[] arr) {
  // Loop bound: cur < end means cur + 3 < arr.ptr + arr.length, so cur[0]
  // through cur[3] read below all lie inside the slice.
  // NOTE(review): for arr.length < 3 this points before arr.ptr; confirm callers avoid it.
  auto end = arr.ptr + arr.length - 3;
  int[4] sums;  // four accumulators; D default-initializes int elements to 0
  for(auto cur = arr.ptr; cur < end; cur += 4) {
    // One independent accumulator per lane, each with a constant index.
    sums[0] += cur[0];
    sums[1] += cur[1];
    sums[2] += cur[2];
    sums[3] += cur[3];
  }

  // Combine the four partial sums into the result.
  return sums[0] + sums[1] + sums[2] + sums[3];
}

// Same computation as the array-based variant, but the four partial sums are
// separate scalar locals instead of elements of a static array.
// Elements left over when arr.length is not a multiple of 4 are not added.
int sum2(const int[] arr) {
  // Loop bound: cur < end means cur + 3 < arr.ptr + arr.length, so cur[0]
  // through cur[3] read below all lie inside the slice.
  // NOTE(review): for arr.length < 3 this points before arr.ptr; confirm callers avoid it.
  auto end = arr.ptr + arr.length - 3;
  int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;  // locals sum1/sum2 shadow nothing here but share names with the functions
  for(auto cur = arr.ptr; cur < end; cur += 4) {
    // One independent accumulator per lane.
    sum0 += cur[0];
    sum1 += cur[1];
    sum2 += cur[2];
    sum3 += cur[3];
  }

  // Combine the four partial sums into the result.
  return sum0 + sum1 + sum2 + sum3;
}

ASM:

; Compiler output quoted for the array-accumulator function (mangled name
; contains "sum1"). Operand order is destination,source.
; The four accumulators stay in stack memory at -020h, -01Ch, -018h and
; -014h[RBP]; every loop iteration does a load plus a read-modify-write add
; on each of them.
_D5test34sum1FxAiZi:
		push	RBP
		mov	RBP,RSP
		sub	RSP,020h		; reserve 32 bytes of frame space
		push	RBX			; RBX is used as scratch in the loop and restored before ret
		lea	RAX,-020h[RBP]		; RAX = address of the accumulator area
		xor	RCX,RCX
		mov	[RAX],RCX		; zero bytes 0..7 of the area
		mov	8[RAX],RCX		; zero bytes 8..15 (4 x 32-bit slots in total)
		mov	R8,RSI			; R8 = loop cursor (presumably RSI = arr.ptr)
		mov	RDX,RSI
		mov	R9,RDI			; presumably RDI = arr.length
		lea	R9,0FFFFFFF4h[R9*4][RDX]	; R9 = RDX + R9*4 - 12, matching end = ptr + length - 3 for 4-byte ints
		cmp	RDX,R9
		jae	L51			; start >= end (unsigned): skip the loop
L2D:		mov	EBX,[R8]		; cur[0]
		add	-020h[RBP],EBX		; slot 0 += cur[0], done in memory
		mov	ESI,4[R8]		; cur[1]
		add	-01Ch[RBP],ESI		; slot 1 += cur[1]
		mov	EAX,8[R8]		; cur[2]
		add	-018h[RBP],EAX		; slot 2 += cur[2]
		mov	ECX,0Ch[R8]		; cur[3]
		add	-014h[RBP],ECX		; slot 3 += cur[3]
		add	R8,010h			; advance cursor by 16 bytes (four ints)
		cmp	R8,R9
		jb	L2D			; loop while cursor < end (unsigned)
L51:		mov	EAX,-020h[RBP]		; EAX = slot 0 ...
		add	EAX,-01Ch[RBP]		; ... + slot 1
		add	EAX,-018h[RBP]		; ... + slot 2
		add	EAX,-014h[RBP]		; ... + slot 3; result left in EAX
		pop	RBX
		mov	RSP,RBP
		pop	RBP
		ret
		nop

; Compiler output quoted for the scalar-accumulator function (mangled name
; contains "sum2"). Operand order is destination,source.
; The four accumulators are held in registers R9D, R11D, EBX and R12D for the
; whole loop; each iteration is four adds with a memory source operand.
_D5test34sum2FxAiZi:
		push	RBP
		mov	RBP,RSP
		sub	RSP,010h		; reserves 16 bytes; no access to them is visible in this listing
		push	RBX			; RBX, R12 and R13 are saved here and restored in reverse order before ret
		push	R12
		push	R13
		xor	R9D,R9D			; accumulator 0 = 0
		xor	R11D,R11D		; accumulator 1 = 0
		xor	EBX,EBX			; accumulator 2 = 0
		xor	R12D,R12D		; accumulator 3 = 0
		mov	R8,RSI			; R8 = loop cursor (presumably RSI = arr.ptr)
		mov	RAX,RSI
		mov	R13,RDI			; presumably RDI = arr.length
		lea	R13,0FFFFFFF4h[R13*4][RAX]	; R13 = RAX + R13*4 - 12, matching end = ptr + length - 3 for 4-byte ints
		cmp	RAX,R13
		jae	L46			; start >= end (unsigned): skip the loop
L2E:		add	R9D,[R8]		; accumulator 0 += cur[0]
		add	R11D,4[R8]		; accumulator 1 += cur[1]
		add	EBX,8[R8]		; accumulator 2 += cur[2]
		add	R12D,0Ch[R8]		; accumulator 3 += cur[3]
		add	R8,010h			; advance cursor by 16 bytes (four ints)
		cmp	R8,R13
		jb	L2E			; loop while cursor < end (unsigned)
L46:		lea	EAX,[R11][R9]		; EAX = accumulator 1 + accumulator 0
		add	EAX,EBX			; ... + accumulator 2
		add	EAX,R12D		; ... + accumulator 3; result left in EAX
		pop	R13
		pop	R12
		pop	RBX
		mov	RSP,RBP
		pop	RBP
		ret
Comment 1 Walter Bright 2013-04-25 12:41:56 UTC
The optimizer currently does not enregister variables that don't fit into registers. It could be enhanced to do it.
Comment 2 timon.gehr 2013-04-25 12:48:21 UTC
(In reply to comment #1)
> The optimizer currently does not enregister variables that don't fit into
> registers. It could be enhanced to do it.

ints definitely fit into registers.
Comment 3 Walter Bright 2013-04-25 13:44:14 UTC
(In reply to comment #2)
> ints definitely fit into registers.

sums is one variable, not 4.
Comment 4 bearophile_hugs 2013-06-14 03:16:06 UTC
(In reply to comment #0)

> In GDC, the main loops do, though the pre-loop setup stuff
> does compile differently for reasons I don't understand.

ldc2 compiles them to the same asm (-release -profile-verifier-noassert -O5 -output-s):

# ldc2 output quoted for the array-accumulator function (32-bit x86,
# AT&T operand order: source, destination).
# All four accumulators are kept in registers (%ecx, %edx, %edi, %eax).
__D4test4sum1FxAiZi:
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	movl	16(%esp), %eax		# first stack argument; used as the element count below
	movl	20(%esp), %esi		# second stack argument; used as the cursor below
	leal	-12(%esi,%eax,4), %ebx	# ebx = esi + eax*4 - 12, matching end = ptr + length - 3
	xorl	%ecx, %ecx		# zero the four accumulators
	xorl	%edx, %edx
	xorl	%edi, %edi
	xorl	%eax, %eax		# eax is reused as an accumulator once the count is consumed
	jmp	LBB0_2			# enter at the loop test
	.align	16, 0x90
LBB0_1:
	addl	12(%esi), %eax		# eax += cur[3]
	addl	8(%esi), %edi		# edi += cur[2]
	addl	4(%esi), %edx		# edx += cur[1]
	addl	(%esi), %ecx		# ecx += cur[0]
	addl	$16, %esi		# advance cursor by four ints
LBB0_2:
	cmpl	%ebx, %esi
	jb	LBB0_1			# loop while cursor < end (unsigned)
	addl	%edi, %eax		# fold the four accumulators into eax
	addl	%edx, %eax
	addl	%ecx, %eax
	popl	%esi
	popl	%edi
	popl	%ebx
	ret	$8			# return and pop the 8 bytes of stack arguments


# ldc2 output quoted for the scalar-accumulator function (32-bit x86,
# AT&T operand order: source, destination).
# All four accumulators are kept in registers; only the assignment of
# accumulators to registers is visible as a difference inside the loop.
__D4test4sum2FxAiZi:
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	movl	16(%esp), %eax		# first stack argument; used as the element count below
	movl	20(%esp), %esi		# second stack argument; used as the cursor below
	leal	-12(%esi,%eax,4), %ebx	# ebx = esi + eax*4 - 12, matching end = ptr + length - 3
	xorl	%ecx, %ecx		# zero the four accumulators
	xorl	%edx, %edx
	xorl	%edi, %edi
	xorl	%eax, %eax		# eax is reused as an accumulator once the count is consumed
	jmp	LBB1_2			# enter at the loop test
	.align	16, 0x90
LBB1_1:
	addl	12(%esi), %ecx		# ecx += cur[3]
	addl	8(%esi), %edx		# edx += cur[2]
	addl	4(%esi), %edi		# edi += cur[1]
	addl	(%esi), %eax		# eax += cur[0]
	addl	$16, %esi		# advance cursor by four ints
LBB1_2:
	cmpl	%ebx, %esi
	jb	LBB1_1			# loop while cursor < end (unsigned)
	addl	%edi, %eax		# fold the four accumulators into eax
	addl	%edx, %eax
	addl	%ecx, %eax
	popl	%esi
	popl	%edi
	popl	%ebx
	ret	$8			# return and pop the 8 bytes of stack arguments
Comment 5 Vladimir Panteleev 2017-07-18 13:21:55 UTC
FWIW, after https://github.com/dlang/dmd/pull/6176 (2.073.0) the generated assembly code changed; however, a simple benchmark shows that performance did not noticeably change.