sieve.txt 5-Oct-2015 This is how sieve.xpl gets compiled with xpl0, the optimizing code generator (iilo), and the scheduler (sch). Some editing was done to the actual output file (sieve.s): Explanatory comments were added, and unnecessary code definition comments were deleted. This version of sieve.xpl is slightly different than the version shown for the non-optimizing compiler (xplr). The only substantive difference is that the dimension of the Flags array is in the declaration rather than being set up using the Reserve intrinsic. This enables use of r6. r11 = base address of local procedure variables (not used here) r10 = base address of heap (where all variables and arrays reside) r9 = heap pointer r8 = base address of runtime support code variables .include "natrpi.s" the runtime code gets assembled .text over and over - probably will .align 2 get linked in someday progrm: compiled program's entry point @ \sieve.xpl gcc uses "@" for comments @ \Eratosthenes Sieve Prime Number Program @ define Size = 8190; @ integer Prime, Count, Iter, I, K; mov r1, #0x000000FF @0B IMM 2 instructions set r1 = 8190+1 orr r1, #0x00001F00 push {r1} Flags' dimension is declared @EC STK mov r2, #1 @0B IMM it's a 1-dimensional array push {r2} @EC STK push {r2} with 1 byte in each element @EC STK mov r3, #28 @0B IMM offset to the "Flags" pointer push {r3} @F6 REGARY @ char Flags(Size+1); str r9, [r8, #baseary-dseg] @F5 BASEARY mov r12, r14 overhead for mkarray subroutine add r9, #0x000020 @09 HPI heap space used by 6 variables bl mkarray @48 ARY allocate space for array and set mov r6, r0 Flags and r6 to point to it ldr r0, [r8, #base0-dseg] @F4 BASE most of this is actually not push {r0, r11, r12} used at this nesting level 0 ldr r11, [r8, #baseary-dseg] str r11, [r8, #base0-dseg] base0 is not used at level 0 @ED CLEAN @EC STK mov r1, #0 @0B IMM device 0 = console push {r1} .data .align 2 L1: .ascii "10000 iterationó" last character has its MSB set .text .align 2 add r0, r8, #(L1-dseg)&0xFF @0B IMM point to message string add r0, #(L1-dseg)&0xFF00 add r0, #(L1-dseg)&0xFF0000 required for large programs bl intr12 @0C CML call Text intrinsic @ED CLEAN mov r0, #0 @0B IMM device 0 = console bl intr9 @0C CML do carriage return & linefeed @ [Text(0, "10000 iterations"); CrLf(0); mov r1, #1 @0B IMM Iter:= 1 str r1, [r10, #16] @03 STO @E8 FORSET mov r7, r1 r7:= Iter mov r2, #0x00000710 @0B IMM push loop limit (10000) on stack orr r2, #0x00002000 push {r2} b K2 @07 JMP jump to bottom of 'for' loop K3: mov r1, #0 @0B IMM Count:= 0 str r1, [r10, #12] @03 STO str r1, [r10, #20] @03 STO I:= 0 @E8 FORSET str r7, [r10, #16] unneeded but it's not inside loop mov r7, r1 r7:= I mov r2, #0x000000FE @0B IMM push loop limit (8190) on stack orr r2, #0x00001F00 push {r2} b K4 @07 JMP jump to bottom of 'for' loop K5:@ for Iter:= 1 to 10000 do \do program 10000 times @ [Count:= 0; \initialize prime counter @ for I:= 0 to Size do mvn r1, #0 @0B IMM r1:= true; (not 0 = $FFFF_FFFF) strb r1, [r6, r7] @E5 STX2 Flags(I):= r1 add r7, #1 @19 INP increment I's value K4: @18 FOR ldr r0, [sp] @ ***** get loop limit (2 cycle latency) str r7, [r10, #20] save I while waiting - it could cmp r7, r0 be used later ble K5 loop until limit is exceeded ldr r7, [r10, #16] @ ***** r7:=Iter; moved to avoid latency add sp, #4 discard Size limit on stack mov r1, #0 @0B IMM I:= 0 str r1, [r10, #20] @03 STO save I (redundant) @E8 FORSET str r7, [r10, #16] save Iter (redundant) mov r7, r1 r7:= I mov r2, #0x000000FE @0B IMM push loop limit (8190) on stack orr r2, #0x00001F00 push {r2} b K6 @07 JMP jump to bottom of 'for' loop K7:@ Flags(I):= true; \set Flags all true @ for I:= 0 to Size do ldrb r0, [r6, r7] @02 LDX if Flags(I) then cmp r0, #0 @08 JPC beq K9 add r0, r7, r7 @0D ADD Prime:= I+I+3 add r0, r0, #3 @0D ADD str r0, [r10, #8] @03 STO add r0, r7, r0 @0D ADD K:= I+Prime str r0, [r10, #24] @03 STO K10: @12..17 JOC start time-critical loop ldr r1, [r10, #24] @ ***** fetch K now to avoid latency mov r2, #0x000000FE @0B IMM while K<= Size do orr r2, #0x00001F00 subs r14, r1, r2 bgt K12 ldr r4, [r10, #8] @ ***** fetch Prime now to avoid latency @ if Flags(I) then \found a prime @ [Prime:= I+I+3; \twice the index + 3 @ K:= I+Prime; \first multiple to kill @ while K <= Size do mov r3, #0 @0B IMM Flags(K):= false strb r3, [r6, r1] @E5 STX2 add r0, r1, r4 @0D ADD K:= Prime+K str r0, [r10, #24] @03 STO b K10 @07 JMP back to time-critical loop K12:@ [Flags(K):= false; \zero a non-prime ldr r1, [r10, #12] @ ***** @ K:= K+Prime; \next multiple of Prime @ ]; add r0, r1, #1 @0D ADD Count++ str r0, [r10, #12] @03 STO K9: add r7, #1 @19 INP I++ K6: @18 FOR ldr r0, [sp] @ ***** check 'for' loop limit str r7, [r10, #20] save I in case it's used later cmp r7, r0 ble K7 ldr r7, [r10, #16] @ ***** reload r7 with Iter add sp, #4 discard I's loop limit @ Count:= Count+1; \primes found @ ]; add r7, #1 @19 INP Iter++ K2: @18 FOR ldr r0, [sp] @ ***** check Iter's loop limit str r7, [r10, #16] cmp r7, r0 ble K3 ldr r0, [r10, #12] @ ***** add sp, #4 discard Iter's limit @ ]; @ED CLEAN @EC STK mov r1, #0 @0B IMM device 0 = console push {r1} bl intr11 @0C CML call IntOut intrinsic @ED CLEAN @EC STK mov r1, #0 @0B IMM device 0 = console push {r1} .data .align 2 L13: .ascii " primes\015Š" .text .align 2 add r0, r8, #(L13-dseg)&0xFF @0B IMM add r0, #(L13-dseg)&0xFF00 add r0, #(L13-dseg)&0xFF0000 bl intr12 @0C CML call Text intrinsic @ IntOut(0, Count); Text(0, " primes @ "); \primes found in 10000th pass @ ] pop {r11} @06 RET not needed at level 0 ldr r9, [r8, #base0-dseg] str r11, [r8, #base0-dseg] pop {r11, pc} storing lr into pc does return .bss .align 4 heaplo: .skip 0x4000000 allow up to 64 MB of heap space .end This shows how the non-optimizing compiler (xplr) generates code for sieve.xpl. \SIEVE.XPL \Eratosthenes Sieve Prime Number Program code RESERVE=3, CRLF=9, INTOUT=11, TEXT=12; define SIZE = 8190; integer PRIME, COUNT, ITER, I, K; char FLAGS; begin @ .include runtime.s Not used yet .text specify memory for code .align 2 must be on 4-byte boundary progrm: ldr r0, [r8, #base0-dseg] save base0 (not actually needed) push {r0, lr} str r9, [r8, #base0-dseg] set base0 to heap pointer add r9, #32 @09 HPI reserve heap space for variables FLAGS:= RESERVE(SIZE+1); mov r0, #0xFE @0B IMM 2 instructions to make constant orr r0, #0x1F00 push {r0} stack-oriented design uses lots mov r0, #0x01 @0B IMM of extra instructions pop {r1} @0D ADD (optimized version is coming) add r0, r1 bl intr3 @0C CML call intrinsic code 3 (Reserve) str r0, [r10, #28] @03 STO store array address in FLAGS TEXT(0, "10000 iterations"); CrLf(0); .data .align 2 L1: .ascii "10000 iterationó" last char has its MSB set .text .align 2 mov r0, #0x00 @0B IMM device 0 = console push {r0} add r0, r8, #(L1-dseg)&0xFF @0B IMM point to string add r0, #(L1-dseg)&0xFF00 bl intr12 @0C CML call TEXT instrinsic mov r0, #0x00 @0B IMM call CrLf intrinsic bl intr9 @0C CML for ITER:= 1, 10000 do \do program 10000 times mov r0, #0x01 @0B IMM ITER:= 1 str r0, [r10, #16] @03 STO mov r0, #0x10 @0B IMM loop limit is on the stack orr r0, #0x2700 push {r0} b L2 @07 JMP test limit immediately begin COUNT:= 0; \prime counter L3: mov r0, #0x00 @0B IMM str r0, [r10, #12] @03 STO for I:= 0, SIZE do mov r0, #0x00 @0B IMM str r0, [r10, #20] @03 STO mov r0, #0xFE @0B IMM orr r0, #0x1F00 push {r0} b L4 @07 JMP FLAGS(I):= true; \set flags all true L5: ldr r0, [r10, #28] @01 LOD FLAGS push {r0} ldr r0, [r10, #20] @01 LOD I pop {r1} @0D ADD add r0, r1 push {r0} mvn r0, #0x00 @0B IMM not 0 = 0xFFFFFFFF = true pop {r1} @04 STX strb r0, [r1] store just the low byte ldr r0, [r10, #20] @19 INP bump 'for' loop counter I add r0, #1 str r0, [r10, #20] L4: ldr r0, [r10, #20] @18 FOR check if limit reached ldr r1, [sp] cmp r0, r1 ble L5 branch back if not add sp, #4 pop limit value off stack for I:= 0, SIZE do mov r0, #0x00 @0B IMM str r0, [r10, #20] @03 STO mov r0, #0xFE @0B IMM orr r0, #0x1F00 push {r0} b L6 @07 JMP if FLAGS(I) then \found a prime L7: ldr r0, [r10, #28] @01 LOD push {r0} ldr r0, [r10, #20] @01 LOD pop {r1} @02 LDX ldrb r0, [r0, r1] cmp r0, #0 beq L8 @08 JPC begin PRIME:= I +I +3; \twice the index + 3 ldr r0, [r10, #20] @01 LOD push {r0} ldr r0, [r10, #20] @01 LOD doesn't know r0 already has I pop {r1} @0D ADD (but optimized version will) add r0, r1 push {r0} mov r0, #0x03 @0B IMM pop {r1} @0D ADD add r0, r1 str r0, [r10, #8] @03 STO K:= I + PRIME; \first multiple to kill ldr r0, [r10, #20] @01 LOD push {r0} ldr r0, [r10, #8] @01 LOD pop {r1} @0D ADD add r0, r1 str r0, [r10, #24] @03 STO while K <= SIZE do L9: ldr r0, [r10, #24] @01 LOD push {r0} mov r0, #0xFE @0B IMM orr r0, #0x1F00 pop {r1} @1_ CMP comparison puts true or false cmp r1, r0 value on the stack mov r0, #0 suble r0, #1 cmp r0, #0 now it finally uses the compare beq L10 @08 JPC begin FLAGS(K):= false; \zero a non-prime ldr r0, [r10, #28] @01 LOD push {r0} ldr r0, [r10, #24] @01 LOD pop {r1} @0D ADD add r0, r1 push {r0} mov r0, #0x00 @0B IMM pop {r1} @04 STX strb r0, [r1] K:= K +PRIME; \next multiple ldr r0, [r10, #24] @01 LOD push {r0} ldr r0, [r10, #8] @01 LOD pop {r1} @0D ADD add r0, r1 str r0, [r10, #24] @03 STO b L9 @07 JMP end; COUNT:= COUNT +1; \primes found L10: ldr r0, [r10, #12] @01 LOD push {r0} mov r0, #0x01 @0B IMM pop {r1} @0D ADD add r0, r1 str r0, [r10, #12] @03 STO L8: ldr r0, [r10, #20] @19 INP for loop I variable add r0, #1 str r0, [r10, #20] L6: ldr r0, [r10, #20] @18 FOR ldr r1, [sp] cmp r0, r1 ble L7 add sp, #4 end; ldr r0, [r10, #16] @19 INP for loop ITER variable add r0, #1 str r0, [r10, #16] L2: ldr r0, [r10, #16] @18 FOR ldr r1, [sp] cmp r0, r1 ble L3 add sp, #4 end; INTOUT(0, COUNT); TEXT(0, " primes"); \primes found in 10th pass mov r0, #0x00 @0B IMM push {r0} ldr r0, [r10, #12] @01 LOD bl intr11 @0C CML .data .align 2 L11: .ascii " primeó" .text .align 2 mov r0, #0x00 @0B IMM push {r0} add r0, r8, #(L11-dseg)&0xFF @0B IMM add r0, #(L11-dseg)&0xFF00 bl intr12 @0C CML CrLf(0); mov r0, #0x00 @0B IMM bl intr9 @0C CML end; ldr sp, [r8, #stkptr-dseg] @00 EXIT force return to runtime code pop {pc} .bss uninitialized data memory .align 2 heaplo: .skip 0x1000000 reserve 16 MB for XPL arrays etc .end