# HG changeset patch # User Petr Sumbera # Date 1490780079 25200 # Node ID 89b7621146f80ba1635974f0d8fe1c4e7d828089 # Parent 85e8894d1fc696e5ff39e025ec28f712b3b79d75 25799643 zlib performance regression on sparc after 25706942 diff -r 85e8894d1fc6 -r 89b7621146f8 components/zlib/Makefile --- a/components/zlib/Makefile Wed Mar 29 16:47:08 2017 -0700 +++ b/components/zlib/Makefile Wed Mar 29 02:34:39 2017 -0700 @@ -109,7 +109,7 @@ COMPONENT_SYSTEM_TEST_TARGETS= test SYSTEM_TEST=1 -CLEAN_PATHS += $(PROTO_DIR) capabilities/*/*/*.o capabilities/*/*/*.s +CLEAN_PATHS += $(PROTO_DIR) capabilities/*/*/*.o capabilities/*/$(MACH32)/*.s system-test: build $(SYSTEM_TEST_32_and_64) diff -r 85e8894d1fc6 -r 89b7621146f8 components/zlib/capabilities/sun4v/sparcv9/Makefile --- a/components/zlib/capabilities/sun4v/sparcv9/Makefile Wed Mar 29 16:47:08 2017 -0700 +++ b/components/zlib/capabilities/sun4v/sparcv9/Makefile Wed Mar 29 02:34:39 2017 -0700 @@ -26,19 +26,14 @@ include ../Makefile.com include ../../Makefile.com -CFLAGS += -m64 -xarch=sparc4 -xtarget=T4 -xchip=T4 -xO5 +CFLAGS += -m64 -xarch=sparc CPPFLAGS += -D__sparc ASFLAGS = -m64 -K PIC -xarch=sparc4 include ../Makefile.targ include ../../Makefile.targ -DEFLATE_C=../../../build/$(BUILD_ARCH)/deflate.c - -longest_match_t4.s: $(DEFLATE_C) - $(CC) $(CFLAGS) -DLONGEST_MATCH_ONLY -DORIG_LONGEST_MATCH_GLOBAL -S -o $@ $(DEFLATE_C) - all build: $(SYMCAP) clean: - $(RM) *.o *.s + $(RM) *.o diff -r 85e8894d1fc6 -r 89b7621146f8 components/zlib/capabilities/sun4v/sparcv9/longest_match_t4.s --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/components/zlib/capabilities/sun4v/sparcv9/longest_match_t4.s Wed Mar 29 02:34:39 2017 -0700 @@ -0,0 +1,279 @@ +! +! This file was generated by a compiler that is currently not part of the CBE +! (as the CBE compiler does not generate code for the T4 architecture), and +! then it was modified by hand to remove some unnecessary instructions that +! 
the compiler generated and the main loop's branches were rearranged for +! fewer taken branches on the most frequent code path. These modifications +! were made in 7 steps. In each step, a few lines were removed from and added +! to the compiler generated code to produce an equivalent binary. The lines +! that were removed in step i are marked by "!i" at the beginning of the +! line, the lines added in this step are marked by the same string added at the end of +! the line. In other words, let C_i mean the code after step i (C_0 is +! the original, compiler generated code, C_7 is the code in this file). +! To reproduce C_i (0 <= i < 7) first take C_(i+1), remove the lines that +! end in !(i+1), and then remove the !(i+1) string from the beginning of those +! lines that start with it. Comparing C_i and C_(i+1) is a simple task, as +! only a few lines have changed. +! If a compiler (e.g. the Oracle Studio 12.3) becomes part of the CBE and +! it is able to generate code as efficient as that in this file, the +! longest_match.o file can simply be compiled from longest_match.c. +! + + .section ".text",#alloc,#execinstr,#progbits + .file "deflate-t4.c" + + .section ".bss",#alloc,#write,#nobits + +Bbss.bss: + + .section ".data",#alloc,#write,#progbits + +Ddata.data: + + .section ".rodata",#alloc,#progbits +! +! CONSTANT POOL +! + +Drodata.rodata: + + .section ".picdata",#alloc,#write + +Dpicdata.picdata: + + .section ".tbss",#alloc,#write,#tls,#nobits + +Ttbss.bss: + + .section ".tdata",#alloc,#write,#tls,#progbits + +Ttdata.data: + + .section ".rodata1",#alloc,#progbits /* the three strings in this section are not referenced by any instruction of longest_match in this file */ + .align 8 +! +! CONSTANT POOL +! + +.L95: + .ascii "invalid distance too far back\000" + .align 8 +! +! CONSTANT POOL +! + +.L147: + .ascii "invalid distance code\000" + .align 8 +! +! CONSTANT POOL +! + +.L153: + .ascii "invalid literal/length code\000" + + .section ".text",#alloc,#execinstr,#progbits +/* 000000 0 */ .align 4 +! FILE deflate-t4.c + +! 1 !#include +! 2 !#include "deflate.h" +! 3 !#define NIL 0 +! 
5 !uInt longest_match(s, cur_match) +! 6 ! deflate_state *s; +! 7 ! IPos cur_match; /* current match */ +! 8 !{ + +! +! SUBROUTINE longest_match +! Walks a chain of candidate positions starting at cur_match, compares each candidate with the bytes at the scan pointer (%i4), and returns the longest length found, clamped to the word at offset 0xb4 of s. NOTE(review): the s->field names in the comments below are presumed from stock zlib deflate.h, which this file does not show; confirm the offsets against the deflate_state layout of this build. +! OFFSET SOURCE LINE LABEL INSTRUCTION + + .global longest_match + + + longest_match: + + .L900000112: + save %sp, -0xb0, %sp /* new register window: s is %i0, cur_match is %i1 */ + ld [%i0 + 0x58], %l4 /* %l4 = index mask (presumed s->w_mask); NOTE(review): the line this replaced in step 7 read offset 0x5c, confirm which is right */ !7 + ldn [%i0 + 0x70], %l5 /* %l5 = base of the 16-bit chain table (presumed s->prev) */ !7 + and %i1, %l4, %l2 /* %l2 = cur_match & mask */ !7 + prefetch [%l5 + %l2], #n_reads /* early prefetch hint; NOTE(review): index is not scaled by 2 here, unlike the lduh at lm_0x17c */ !7 +!7 ld [%i0 + 0xac], %l4 + ld [%i0 + 0xac], %l1 /* %l1 = presumed s->strstart */ !7 + ld [%i0 + 0x50], %l6 /* %l6 = presumed s->w_size */ + clr %g4 /* %g4 = lowest usable candidate, 0 unless raised below */ + ldn [%i0 + 0x60], %g1 /* %g1 = base of the byte buffer (presumed s->window) */ + ld [%i0 + 0xb8], %i2 /* %i2 = best length so far (presumed s->prev_length) */ + ld [%i0 + 0xbc], %g5 /* %g5 = remaining chain steps (presumed s->max_chain_length) */ + ld [%i0 + 0xd0], %o0 /* %o0 = length that ends the search early (presumed s->nice_match) */ +!5 srl %l4, 0x0, %l5 + ld [%i0 + 0xcc], %l7 /* %l7 = threshold for shortening the chain (presumed s->good_match) */ + add %l6, -0x106, %i3 /* %i3 = %l6 - 262 */ +!5 add %g1, %l5, %i4 +!7 add %g1, %l4, %i4 !5 + add %g1, %l1, %i4 /* %i4 = scan pointer = buffer base plus %l1 */ !7 +!7 cwbleu %l4, %i3, lm_0x38 + cwbleu %l1, %i3, lm_0x38 /* %l1 <= %i3 (unsigned): leave %g4 at 0 */ !7 +!7 sub %l4, %i3, %g4 + sub %l1, %i3, %g4 /* otherwise %g4 = %l1 - %i3 */ !7 + +lm_0x38: +!7 ld [%i0 + 0x5c], %l4 +!4 add %i2, -0x1, %l3 +!7 ldn [%i0 + 0x70], %l5 +!4 sra %l3, 0x0, %o2 + add %i2, -0x1, %o2 !4 + ldub [%i4 + %o2], %o2 /* %o2 = scan[best_len - 1] */ +!3 sra %i2, 0x0, %l6 +!3 ldub [%i4 + %l6], %o1 + ldub [%i4 + %i2], %o1 /* %o1 = scan[best_len] */ !3 + cmp %i2, %l7 /* flags: best_len vs. the word loaded from offset 0xcc */ + add %i4, 0x102, %l7 /* %l7 reused: scan plus 258, end of the compare range */ + ld [%i0 + 0xb4], %i3 /* %i3 reused: upper clamp for the result (presumed s->lookahead) */ + bcs,pn %icc, lm_0x6c /* best_len below threshold (unsigned): skip the shift */ + mov 0x102, %l3 /* delay slot, always executed: %l3 = 258 */ + + srl %g5, 0x2, %g5 /* chain budget divided by 4 */ + +lm_0x6c: + cmp %o0, %i3 +!6 srl %i1, 0x0, %l0 +!7 and %i1, %l4, %l2 !6 + movgu %icc, %i3, %o0 /* %o0 = min(%o0, %i3), unsigned */ + +lm_0x78: +!6 and %i1, %l4, %l2 +!6 add %l0, %g1, %o3 + add %i1, %g1, %o3 /* %o3 = candidate pointer = buffer base plus cur_match */ !6 +!3 ldub [%o3 + %l6], %o5 + ldub [%o3 + %i2], %o5 /* %o5 = candidate[best_len] */ !3 +!1 srl %l2, 0x0, %o4 +!1 sllx %o4, 0x1, %l2 + sllx %l2, 0x1, %l2 /* masked index to byte offset of a 16-bit entry */ !1 + add %l2, %l5, %l1 !1 + prefetch [%l1 - 0x40], #n_reads /* hint: 64 bytes below the current chain entry */ !1 + cwbe %o5, %o1, lm_0x17c_neg /* byte at best_len matches: check the rest */ + +lm_0x17c: + lduh [%l5 + %l2], %i1 /* cur_match = next candidate from the chain table */ + cwbleu %i1, %g4, lm_0x190 /* candidate <= lower bound (unsigned): done */ + + addcc %g5, -0x1, %g5 /* one chain step used */ + bne,pt %icc, lm_0x78 /* loop while budget is nonzero */ +!6 srl %i1, 0x0, %l0 + and %i1, %l4, %l2 /* delay slot: masked index for the next pass */ !6 + +lm_0x190: + cmp %i2, %i3 + movgu %icc, %i3, %i2 /* best_len = min(best_len, %i3), unsigned */ + return %i7 + 0x8 /* return and restore the caller's register window */ + srl %o2, 0x0, %o0 /* delay slot runs in the caller's window, where %o2 is this frame's %i2: result = best_len */ + +lm_0x17c_neg: +!3 add %o3, %l6, %o7 + add %o3, %i2, %o7 !3 + ldub [%o7 - 0x1], %l1 /* %l1 = candidate[best_len - 1] */ + cwbne %l1, %o2, lm_0x17c /* mismatch: next candidate */ + +!6 ldub [%g1 + %l0], %i5 + ldub [%g1 + %i1], %i5 /* %i5 = candidate[0] */ !6 + ldub [%i4], %o5 /* %o5 = scan[0] */ + cwbne %i5, %o5, 
lm_0x17c /* first bytes differ: next candidate */ + + ldub [%i4 + 0x1], %l1 /* %l1 = scan[1] */ + ldub [%o3 + 0x1], %o4 /* %o4 = candidate[1] */ + cwbne %o4, %l1, lm_0x17c /* mismatch: next candidate */ + + add %o3, 0x2, %o3 /* candidate pointer advanced by 2 */ +!1 add %l2, %l5, %l1 + add %i4, 0x2, %o4 /* %o4 = scan plus 2; the loads below pre-increment, so index 2 itself is not compared */ + +lm_0xc0: + ldub [%o4 + 0x1], %l0 /* unrolled compare of 8 byte pairs per pass; %o4 steps by 1 per pair, %o3 by 8 per pass */ + add %o4, 0x1, %o4 + ldub [%o3 + 0x1], %o7 + cwbne %l0, %o7, lm_0x14c + + ldub [%o4 + 0x1], %i5 + add %o4, 0x1, %o4 + ldub [%o3 + 0x2], %o5 + cwbne %i5, %o5, lm_0x14c + + ldub [%o4 + 0x1], %l0 + add %o4, 0x1, %o4 + ldub [%o3 + 0x3], %o7 + cwbne %l0, %o7, lm_0x14c + + ldub [%o4 + 0x1], %i5 + add %o4, 0x1, %o4 + ldub [%o3 + 0x4], %o5 + cwbne %i5, %o5, lm_0x14c + + ldub [%o4 + 0x1], %l0 + add %o4, 0x1, %o4 + ldub [%o3 + 0x5], %o7 + cwbne %l0, %o7, lm_0x14c + + ldub [%o4 + 0x1], %i5 + add %o4, 0x1, %o4 + ldub [%o3 + 0x6], %o5 + cwbne %i5, %o5, lm_0x14c + + ldub [%o4 + 0x1], %l0 + add %o4, 0x1, %o4 + ldub [%o3 + 0x7], %o7 + cwbne %l0, %o7, lm_0x14c + + ldub [%o4 + 0x1], %i5 + add %o4, 0x1, %o4 + ldub [%o3 + 0x8], %o5 + add %o3, 0x8, %o3 + cwbne %i5, %o5, lm_0x14c + + nop + cxbcs %o4, %l7, lm_0xc0 /* another pass while %o4 < %l7 (64-bit unsigned) */ + +lm_0x14c: +!1 prefetch [%l1 - 0x40], #n_reads + sub %l7, %o4, %l0 /* %l0 = bytes left before the end of the compare range */ + sub %l3, %l0, %o7 /* %o7 = 258 - %l0, i.e. %o4 minus the scan pointer = new match length */ + cwble %o7, %i2, lm_0x17c /* not longer than best_len (signed): next candidate */ + + st %i1, [%i0 + 0xb0] /* store cur_match at offset 0xb0 of s (presumed s->match_start) */ + mov %o7, %i2 /* best_len = new length */ + cwbge %o7, %o0, lm_0x190 /* reached the early-exit length (signed): finish */ + +!2 sra %o7, 0x0, %i1 +!3 sra %o7, 0x0, %l6 +!2 add %i4, %i1, %l1 + add %i4, %o7, %l1 !2 +!2 ldub [%i4 + %i1], %o1 + ldub [%i4 + %o7], %o1 /* refresh %o1 = scan[best_len] */ !2 + ba lm_0x17c /* continue with the next candidate */ + ldub [%l1 - 0x1], %o2 /* delay slot: refresh %o2 = scan[best_len - 1] */ + + +/* 0x0220 0 */ .type longest_match,#function +/* 0x0220 0 */ .size longest_match,(.-longest_match) + + + .L900000113: + + .section ".text",#alloc,#execinstr,#progbits +/* 000000 0 */ .align 8 +/* 000000 */ .skip 24 +/* 0x0018 */ .align 4 + + + .L900000286: + + .section ".text",#alloc,#execinstr,#progbits + +! Begin Disassembling Ident + .ident "cg: Sun Compiler Common 12.3 SunOS_sparc 2011/11/16" ! (NO SOURCE LINE) + .ident "acomp: Sun C 5.12 SunOS_sparc 2011/11/16" ! (/tmp/acomp.1329237379.172468.02.sd:24) + .ident "iropt: Sun Compiler Common 12.3 SunOS_sparc 2011/11/16" ! 
(/tmp/acomp.1329237379.172468.02.sd:25) + .ident "cg: Sun Compiler Common 12.3 SunOS_sparc 2011/11/16" ! (NO SOURCE LINE) +! End Disassembling Ident