Browse Source

[pk] various PK cleanups/speedups

cs250
Andrew Waterman 16 years ago
parent
commit
783c0ec831
  1. 97
      pk/fp.c
  2. 13
      pk/fp.h
  3. 84
      pk/fp_asm.S
  4. 39
      pk/memset.c
  5. 1
      pk/pk.c
  6. 6
      pk/pk.h
  7. 2
      pk/pk.ld
  8. 4
      pk/pk.mk.in
  9. 2
      pk/riscv-pk.c
  10. 26
      pk/strlen.c

97
pk/fp.c

@ -2,6 +2,7 @@
#include "softfloat.h"
#include "riscv-opc.h"
#include "pk.h"
#include "fp.h"
#include <stdint.h>
#define noisy 0
@ -10,8 +11,6 @@ static void set_fp_reg(unsigned int which, unsigned int dp, uint64_t val);
static uint64_t get_fp_reg(unsigned int which, unsigned int dp);
static fp_state_t fp_state;
static void get_fp_state();
static void put_fp_state();
static inline void
validate_address(trapframe_t* tf, long addr, int size, int store)
@ -24,9 +23,8 @@ validate_address(trapframe_t* tf, long addr, int size, int store)
int emulate_fp(trapframe_t* tf)
{
fp_state.fsr = mfcr(CR_FSR);
if(have_fp)
get_fp_state();
fp_state.fsr = get_fp_state(fp_state.fpr);
if(noisy)
printk("FPU emulation at pc %lx, insn %x\n",tf->epc,(uint32_t)tf->insn);
@ -45,9 +43,9 @@ int emulate_fp(trapframe_t* tf)
#define XRS2 (tf->gpr[RRS2])
#define XRDR (tf->gpr[RRD])
uint64_t frs1d = get_fp_reg(RRS1, 1);
uint64_t frs2d = get_fp_reg(RRS2, 1);
uint64_t frs3d = get_fp_reg(RRS3, 1);
uint64_t frs1d = fp_state.fpr[RRS1];
uint64_t frs2d = fp_state.fpr[RRS2];
uint64_t frs3d = fp_state.fpr[RRS3];
uint32_t frs1s = get_fp_reg(RRS1, 0);
uint32_t frs2s = get_fp_reg(RRS2, 0);
uint32_t frs3s = get_fp_reg(RRS3, 0);
@ -217,9 +215,8 @@ int emulate_fp(trapframe_t* tf)
else
return -1;
mtcr(fp_state.fsr, CR_FSR);
if(have_fp)
put_fp_state();
put_fp_state(fp_state.fpr,fp_state.fsr);
advance_pc(tf);
@ -231,6 +228,8 @@ int emulate_fp(trapframe_t* tf)
// One-instruction inline-asm helpers that operate on FP register $f<which>.
// `which` and `type` are spliced into the instruction text through STR()
// (defined elsewhere; presumably a stringizing macro), so both have to be
// literal tokens rather than run-time values.
// PUT_FP_REG: emits "mtf.<type> $f<which>, val"; val is an "r" (register) input.
#define PUT_FP_REG(which, type, val) asm("mtf." STR(type) " $f" STR(which) ",%0" : : "r"(val))
// GET_FP_REG: emits "mff.<type> val, $f<which>"; val is an "=r" (register) output.
#define GET_FP_REG(which, type, val) asm("mff." STR(type) " %0,$f" STR(which) : "=r"(val))
// LOAD_FP_REG: emits "l.<type> $f<which>, val"; val is an "m" (memory) input.
#define LOAD_FP_REG(which, type, val) asm("l." STR(type) " $f" STR(which) ",%0" : : "m"(val))
// STORE_FP_REG: emits "s.<type> $f<which>, val"; val is an "=m" (memory)
// output and the statement additionally declares a "memory" clobber.
#define STORE_FP_REG(which, type, val) asm("s." STR(type) " $f" STR(which) ",%0" : "=m"(val) : : "memory")
static void __attribute__((noinline))
set_fp_reg(unsigned int which, unsigned int dp, uint64_t val)
@ -248,11 +247,8 @@ set_fp_reg(unsigned int which, unsigned int dp, uint64_t val)
// to set an SP value, move the SP value into the FPU
// then move it back out as a DP value. OK to clobber $f0
// because we'll restore it later.
uint64_t tmp;
GET_FP_REG(0,d,tmp);
PUT_FP_REG(0,s,val);
GET_FP_REG(0,d,fp_state.fpr[which]);
PUT_FP_REG(0,d,tmp);
}
}
@ -267,11 +263,8 @@ get_fp_reg(unsigned int which, unsigned int dp)
// to get an SP value, move the DP value into the FPU
// then move it back out as an SP value. OK to clobber $f0
// because we'll restore it later.
uint64_t tmp;
GET_FP_REG(0,d,tmp);
PUT_FP_REG(0,d,fp_state.fpr[which]);
GET_FP_REG(0,s,val);
PUT_FP_REG(0,d,tmp);
}
if(noisy)
@ -283,83 +276,11 @@ get_fp_reg(unsigned int which, unsigned int dp)
return val;
}
// Copy FP registers $f0..$f31 into the file-scope fp_state.fpr[0..31].
// Each line expands GET_FP_REG(n, d, dst), i.e. one "mff.d" asm statement
// whose output operand is fp_state.fpr[n].  The register number is pasted
// into the instruction text by the macro, so it must be a literal; presumably
// that is why the 32 moves are spelled out instead of being written as a loop.
// Takes no arguments and returns nothing; marked noinline.
static void __attribute__((noinline)) get_fp_state()
{
GET_FP_REG(0, d, fp_state.fpr[0]);
GET_FP_REG(1, d, fp_state.fpr[1]);
GET_FP_REG(2, d, fp_state.fpr[2]);
GET_FP_REG(3, d, fp_state.fpr[3]);
GET_FP_REG(4, d, fp_state.fpr[4]);
GET_FP_REG(5, d, fp_state.fpr[5]);
GET_FP_REG(6, d, fp_state.fpr[6]);
GET_FP_REG(7, d, fp_state.fpr[7]);
GET_FP_REG(8, d, fp_state.fpr[8]);
GET_FP_REG(9, d, fp_state.fpr[9]);
GET_FP_REG(10, d, fp_state.fpr[10]);
GET_FP_REG(11, d, fp_state.fpr[11]);
GET_FP_REG(12, d, fp_state.fpr[12]);
GET_FP_REG(13, d, fp_state.fpr[13]);
GET_FP_REG(14, d, fp_state.fpr[14]);
GET_FP_REG(15, d, fp_state.fpr[15]);
GET_FP_REG(16, d, fp_state.fpr[16]);
GET_FP_REG(17, d, fp_state.fpr[17]);
GET_FP_REG(18, d, fp_state.fpr[18]);
GET_FP_REG(19, d, fp_state.fpr[19]);
GET_FP_REG(20, d, fp_state.fpr[20]);
GET_FP_REG(21, d, fp_state.fpr[21]);
GET_FP_REG(22, d, fp_state.fpr[22]);
GET_FP_REG(23, d, fp_state.fpr[23]);
GET_FP_REG(24, d, fp_state.fpr[24]);
GET_FP_REG(25, d, fp_state.fpr[25]);
GET_FP_REG(26, d, fp_state.fpr[26]);
GET_FP_REG(27, d, fp_state.fpr[27]);
GET_FP_REG(28, d, fp_state.fpr[28]);
GET_FP_REG(29, d, fp_state.fpr[29]);
GET_FP_REG(30, d, fp_state.fpr[30]);
GET_FP_REG(31, d, fp_state.fpr[31]);
}
// Copy the file-scope fp_state.fpr[0..31] into FP registers $f0..$f31.
// Each line expands PUT_FP_REG(n, d, src), i.e. one "mtf.d" asm statement
// whose input operand is fp_state.fpr[n].  The register number is pasted
// into the instruction text by the macro, so it must be a literal; presumably
// that is why the 32 moves are spelled out instead of being written as a loop.
// Takes no arguments and returns nothing; marked noinline.
static void __attribute__((noinline)) put_fp_state()
{
PUT_FP_REG(0, d, fp_state.fpr[0]);
PUT_FP_REG(1, d, fp_state.fpr[1]);
PUT_FP_REG(2, d, fp_state.fpr[2]);
PUT_FP_REG(3, d, fp_state.fpr[3]);
PUT_FP_REG(4, d, fp_state.fpr[4]);
PUT_FP_REG(5, d, fp_state.fpr[5]);
PUT_FP_REG(6, d, fp_state.fpr[6]);
PUT_FP_REG(7, d, fp_state.fpr[7]);
PUT_FP_REG(8, d, fp_state.fpr[8]);
PUT_FP_REG(9, d, fp_state.fpr[9]);
PUT_FP_REG(10, d, fp_state.fpr[10]);
PUT_FP_REG(11, d, fp_state.fpr[11]);
PUT_FP_REG(12, d, fp_state.fpr[12]);
PUT_FP_REG(13, d, fp_state.fpr[13]);
PUT_FP_REG(14, d, fp_state.fpr[14]);
PUT_FP_REG(15, d, fp_state.fpr[15]);
PUT_FP_REG(16, d, fp_state.fpr[16]);
PUT_FP_REG(17, d, fp_state.fpr[17]);
PUT_FP_REG(18, d, fp_state.fpr[18]);
PUT_FP_REG(19, d, fp_state.fpr[19]);
PUT_FP_REG(20, d, fp_state.fpr[20]);
PUT_FP_REG(21, d, fp_state.fpr[21]);
PUT_FP_REG(22, d, fp_state.fpr[22]);
PUT_FP_REG(23, d, fp_state.fpr[23]);
PUT_FP_REG(24, d, fp_state.fpr[24]);
PUT_FP_REG(25, d, fp_state.fpr[25]);
PUT_FP_REG(26, d, fp_state.fpr[26]);
PUT_FP_REG(27, d, fp_state.fpr[27]);
PUT_FP_REG(28, d, fp_state.fpr[28]);
PUT_FP_REG(29, d, fp_state.fpr[29]);
PUT_FP_REG(30, d, fp_state.fpr[30]);
PUT_FP_REG(31, d, fp_state.fpr[31]);
}
// Load the FP registers from fp_state.
// The status register (PCR_SR) is read, rewritten with the SR_EF bit set
// while the FP state is loaded, and then written back with the value that
// was originally read.  SR_EF is presumably the FP-enable bit -- confirm
// against pcr.h.
void init_fp_regs()
{
long sr = mfpcr(PCR_SR);
mtpcr(sr | SR_EF, PCR_SR);
// NOTE(review): two put_fp_state calls with different argument lists sit
// back to back here.  This looks like the old and the new spelling of the
// same call; confirm which one the file is meant to keep.
put_fp_state();
put_fp_state(fp_state.fpr,fp_state.fsr);
// Put the status register back exactly as it was on entry.
mtpcr(sr, PCR_SR);
}

13
pk/fp.h

@ -0,0 +1,13 @@
#ifndef _FP_H
#define _FP_H

// uint64_t / uint32_t are used below; include their header here so that
// fp.h compiles regardless of what the including file pulled in first.
#include <stdint.h>

// In-memory image of the floating-point state.
typedef struct
{
  uint64_t fpr[32];  // one 64-bit slot per FP register, $f0..$f31
  uint32_t fsr;      // value of the FP status register (CR_FSR)
} fp_state_t;

// Load $f0..$f31 from the 32 consecutive doublewords at fp_regs
// (32 * 8 = 256 bytes are read), then write fsr to CR_FSR.
// Implemented in fp_asm.S.
void put_fp_state(const void* fp_regs, long fsr);

// Store $f0..$f31 as 32 consecutive doublewords at fp_regs
// (32 * 8 = 256 bytes are written) and return the current CR_FSR value.
// Implemented in fp_asm.S.
long get_fp_state(void* fp_regs);

#endif

84
pk/fp_asm.S

@ -0,0 +1,84 @@
#include "pcr.h"
.text
/*
 * long get_fp_state(void* fp_regs)
 *
 * Reads CR_FSR into $v0 (the value handed back to the caller) and stores
 * $f0..$f31 as consecutive doublewords at 0..248($a0), where $a0 carries
 * fp_regs.
 *
 * The routine previously had no return instruction, so execution fell
 * through into put_fp_state, which then wrote whatever happened to be in
 * $a1 into CR_FSR.  An explicit return is added below.
 * NOTE(review): "jr $ra" is assumed to be the return idiom for this
 * assembler -- confirm against the toolchain before merging.
 */
.globl get_fp_state
.ent get_fp_state
get_fp_state:
  mfcr $v0, ASM_CR(CR_FSR)
  s.d $f0 , 0($a0)
  s.d $f1 , 8($a0)
  s.d $f2 , 16($a0)
  s.d $f3 , 24($a0)
  s.d $f4 , 32($a0)
  s.d $f5 , 40($a0)
  s.d $f6 , 48($a0)
  s.d $f7 , 56($a0)
  s.d $f8 , 64($a0)
  s.d $f9 , 72($a0)
  s.d $f10, 80($a0)
  s.d $f11, 88($a0)
  s.d $f12, 96($a0)
  s.d $f13,104($a0)
  s.d $f14,112($a0)
  s.d $f15,120($a0)
  s.d $f16,128($a0)
  s.d $f17,136($a0)
  s.d $f18,144($a0)
  s.d $f19,152($a0)
  s.d $f20,160($a0)
  s.d $f21,168($a0)
  s.d $f22,176($a0)
  s.d $f23,184($a0)
  s.d $f24,192($a0)
  s.d $f25,200($a0)
  s.d $f26,208($a0)
  s.d $f27,216($a0)
  s.d $f28,224($a0)
  s.d $f29,232($a0)
  s.d $f30,240($a0)
  s.d $f31,248($a0)
  jr $ra
.end get_fp_state
/*
 * void put_fp_state(const void* fp_regs, long fsr)
 *
 * Loads $f0..$f31 from the consecutive doublewords at 0..248($a0), where
 * $a0 carries fp_regs, then writes $a1 (fsr) to CR_FSR.
 *
 * The routine previously had no return instruction, so execution ran off
 * the end of the function after the mtcr.  An explicit return is added
 * below.
 * NOTE(review): "jr $ra" is assumed to be the return idiom for this
 * assembler -- confirm against the toolchain before merging.
 */
.globl put_fp_state
.ent put_fp_state
put_fp_state:
  l.d $f0 , 0($a0)
  l.d $f1 , 8($a0)
  l.d $f2 , 16($a0)
  l.d $f3 , 24($a0)
  l.d $f4 , 32($a0)
  l.d $f5 , 40($a0)
  l.d $f6 , 48($a0)
  l.d $f7 , 56($a0)
  l.d $f8 , 64($a0)
  l.d $f9 , 72($a0)
  l.d $f10, 80($a0)
  l.d $f11, 88($a0)
  l.d $f12, 96($a0)
  l.d $f13,104($a0)
  l.d $f14,112($a0)
  l.d $f15,120($a0)
  l.d $f16,128($a0)
  l.d $f17,136($a0)
  l.d $f18,144($a0)
  l.d $f19,152($a0)
  l.d $f20,160($a0)
  l.d $f21,168($a0)
  l.d $f22,176($a0)
  l.d $f23,184($a0)
  l.d $f24,192($a0)
  l.d $f25,200($a0)
  l.d $f26,208($a0)
  l.d $f27,216($a0)
  l.d $f28,224($a0)
  l.d $f29,232($a0)
  l.d $f30,240($a0)
  l.d $f31,248($a0)
  mtcr $a1, ASM_CR(CR_FSR)
  jr $ra
.end put_fp_state

39
pk/memset.c

@ -0,0 +1,39 @@
#include <stdlib.h>
#include <limits.h>
#include <string.h>
// Fill the first s bytes at m with (unsigned char)ch and return m.
//
// Strategy: byte stores until the cursor is long-aligned, then whole-word
// stores (unrolled eight at a time), then byte stores for what is left.
//
// Fixes relative to the previous version:
//  - the alignment loop tested the never-changing m instead of the moving
//    cursor, so an unaligned start degenerated into a pure byte loop;
//  - the word loop used a byte count (rounded up) as a bound on a word
//    index and always stored eight words per step, writing far beyond
//    m + s (64 bytes even for s == 1);
//  - the tail loop started at or past s and therefore never ran;
//  - the pattern was built in a signed long and shifted by 32, which is
//    undefined where long is 32 bits wide and for fill bytes >= 0x80.
void* memset(void* m, int ch, size_t s)
{
  unsigned char* mem = (unsigned char*)m;
  const unsigned char byte = (unsigned char)ch;

  // Head: advance one byte at a time until the cursor is long-aligned
  // (or the request is exhausted).
  while(((unsigned long)mem & (sizeof(long)-1)) && s)
  {
    *mem++ = byte;
    s--;
  }

  // Replicate the fill byte into every byte lane of one word.
  unsigned long l = byte;
  l |= l << 8;
  l |= l << 16;
  if(sizeof(long) == 8)
    l |= (l << 16) << 16;  // two 16-bit shifts: well defined for any long width
  else if(sizeof(long) != 4)
    abort();

  // Body: only whole words that lie entirely inside the request.
  unsigned long* lmem = (unsigned long*)mem;
  const size_t nwords = s / sizeof(long);
  size_t i = 0;

  for(; i + 8 <= nwords; i += 8)
  {
    lmem[i+0] = l;
    lmem[i+1] = l;
    lmem[i+2] = l;
    lmem[i+3] = l;
    lmem[i+4] = l;
    lmem[i+5] = l;
    lmem[i+6] = l;
    lmem[i+7] = l;
  }
  for(; i < nwords; i++)
    lmem[i] = l;

  // Tail: the 0..sizeof(long)-1 bytes after the last whole word.
  mem = (unsigned char*)(lmem + nwords);
  for(size_t tail = s % sizeof(long); tail; tail--)
    *mem++ = byte;

  return m;
}

1
pk/pk.c

@ -145,6 +145,7 @@ static void mainvars_init()
static void jump_usrstart()
{
printk("strlen(\"\") = %d\n",strlen(""));
trapframe_t tf;
init_tf(&tf, USER_START, USER_MEM_SIZE-USER_MAINVARS_SIZE);
pop_tf(&tf);

6
pk/pk.h

@ -13,12 +13,6 @@ typedef struct
long insn;
} trapframe_t;
// In-memory image of the floating-point state: 32 64-bit register slots
// (fpr) plus the FP status register value (fsr).
typedef struct
{
uint64_t fpr[32];
uint32_t fsr;
} fp_state_t;
#define USER_MEM_SIZE 0x70000000
#define USER_MAINVARS_SIZE 0x1000
#define USER_START 0x10000

2
pk/pk.ld

@ -2,8 +2,6 @@ OUTPUT_ARCH( "mips:riscv" )
ENTRY( __start )
GROUP( -lc -lgcc -lgloss )
SECTIONS
{

4
pk/pk.mk.in

@ -5,6 +5,7 @@ pk_subproject_deps = \
pk_hdrs = \
pk.h \
pcr.h \
fp.h \
atomic.h \
file.h \
frontend.h \
@ -17,9 +18,12 @@ pk_c_srcs = \
handlers.c \
frontend.c \
fp.c \
memset.c \
strlen.c \
pk_asm_srcs = \
entry.S \
fp_asm.S \
pk_test_srcs =

2
pk/riscv-pk.c

@ -22,7 +22,7 @@ void __attribute__((section(".boottext"))) __start()
#endif
mtpcr(sr0 | SR_EF, PCR_SR);
have_fp = 0;//mfpcr(PCR_SR) & SR_EF;
have_fp = mfpcr(PCR_SR) & SR_EF;
mtpcr(sr0, PCR_SR);
extern void boot();

26
pk/strlen.c

@ -0,0 +1,26 @@
#include <string.h>
#include <stdlib.h>
// Returns nonzero iff at least one byte of l is 0x00.
// from http://www-graphics.stanford.edu/~seander/bithacks.html
//
// The masks are derived from the width of unsigned long instead of being
// chosen by a sizeof(long) if/else chain.  The result is the same as before
// for 4- and 8-byte longs, and control can no longer fall off the end of the
// function without a return value for any other width.
static inline long hasZeroByte(long l)
{
  const unsigned long ones  = ~0UL / 0xFF;  // 0x01 in every byte
  const unsigned long highs = ones << 7;    // 0x80 in every byte
  const unsigned long v = (unsigned long)l;

  // (v - ones) borrows into a byte's top bit when that byte was zero;
  // "& ~v" discards bytes whose top bit was already set in the input.
  return (long)((v - ones) & ~v & highs);
}
// Length of the NUL-terminated string s, not counting the terminator.
size_t strlen(const char* s)
{
  const char* p = s;

  // Fast path, taken only when s itself sits on a long boundary: step a
  // whole long at a time until the current long contains a zero byte.
  if(((long)s & (sizeof(long)-1)) == 0)
  {
    for(; !hasZeroByte(*(long*)p); p += sizeof(long))
      ;
  }

  // Locate the terminator byte by byte (this is the entire scan when the
  // fast path was not taken).
  while(*p != '\0')
    p++;

  return p - s;
}
Loading…
Cancel
Save