/* -----------------------------------------------------------------------------
 * $Id: Prims.hc,v 1.37 1998/06/05 11:16:36 simonm Exp $
 *
 * Primitive functions / data
 *
 * ---------------------------------------------------------------------------*/

#include "Rts.h"
#include "StgStartup.h"
#include "Schedule.h"
#include "RtsUtils.h"

/* ** temporary **

   classes CCallable and CReturnable don't really exist, but the
   compiler insists on generating dictionaries containing references
   to GHC_ZcCCallable_static_info etc., so we provide dummy symbols
   for these.
*/

W_ GHC_ZcCCallable_static_info[];
W_ GHC_ZcCReturnable_static_info[];

#ifndef INTERPRETER_ONLY
#ifndef aix_TARGET_OS /* AIX gives link errors with this as a const (RO assembler section) */
const 
#endif 
      StgClosure *PrelBase_Bool_closure_tbl[] = {
    &False_closure,
    &True_closure
};
#endif

/* -----------------------------------------------------------------------------
   Macros for Hand-written primitives.
   -------------------------------------------------------------------------- */

/*
 * Horrible macros for returning unboxed tuples.
 *
 * How an unboxed tuple is returned depends on two factors:
 *    - the number of real registers we have available
 *    - the boxedness of the returned fields.
 *
 * To return an unboxed tuple from a primitive operation, we have macros
 * RET_<layout> where <layout> describes the boxedness of each field of the
 * unboxed tuple:  N indicates a non-pointer field, and P indicates a pointer.
 *
 * We only define the cases actually used, to avoid having too much
 * garbage in this section.  Warning: any bugs in here will be hard to
 * track down.
 */

/*------ All Regs available */
#ifdef REG_R8
# define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
# define RET_N(a)     RET_P(a)

# define RET_PP(a,b)  R1.w = (W_)(a); R2.w = (W_)(b); JMP_(ENTRY_CODE(Sp[0]));
# define RET_NN(a,b)  RET_PP(a,b)
# define RET_NP(a,b)  RET_PP(a,b)

# define RET_PPP(a,b,c) \
	R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); JMP_(ENTRY_CODE(Sp[0]));
# define RET_NNP(a,b,c) RET_PPP(a,b,c)

# define RET_NNNP(a,b,c,d) \
        R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); R4.w = (W_)d; \
        JMP_(ENTRY_CODE(Sp[0]));

# define RET_NNPNNP(a,b,c,d,e,f) \
        R1.w = (W_)(a); R2.w = (W_)(b); R3.w = (W_)(c); \
        R4.w = (W_)(d); R5.w = (W_)(e); R6.w = (W_)(f); \
	JMP_(ENTRY_CODE(Sp[0]));

#else

#if defined(REG_R7) || defined(REG_R6) || defined(REG_R5) || \
    defined(REG_R4) || defined(REG_R3) || defined(REG_R2)
# error RET_n macros not defined for this setup.
#else

/*------ 1 Register available */
#ifdef REG_R1
# define RET_P(a)     R1.w = (W_)(a); JMP_(ENTRY_CODE(Sp[0]));
# define RET_N(a)     RET_P(a)

# define RET_PP(a,b)   R1.w = (W_)(a); Sp[-1] = (W_)(b); Sp -= 1; \
		       JMP_(ENTRY_CODE(Sp[1]));
# define RET_NN(a,b)   R1.w = (W_)(a); Sp[-1] = (W_)(b); Sp -= 2; \
		       JMP_(ENTRY_CODE(Sp[2]));
# define RET_NP(a,b)   RET_PP(a,b)

# define RET_PPP(a,b,c) \
	R1.w = (W_)(a); Sp[-2] = (W_)(b); Sp[-1] = (W_)(c); Sp -= 2; \
	JMP_(ENTRY_CODE(Sp[2]));
# define RET_NNP(a,b,c) \
	R1.w = (W_)(a); Sp[-2] = (W_)(b); Sp[-1] = (W_)(c); Sp -= 3; \
	JMP_(ENTRY_CODE(Sp[3]));

# define RET_NNNP(a,b,c,d)			\
	R1.w = (W_)(a); 			\
    /*  Sp[-5] = ARGTAG(1); */			\
        Sp[-4] = (W_)(b); 			\
    /*  Sp[-3] = ARGTAG(1); */			\
        Sp[-2] = (W_)(c); 			\
        Sp[-1] = (W_)(d); 			\
        Sp -= 5;				\
        JMP_(ENTRY_CODE(Sp[5]));

# define RET_NNPNNP(a,b,c,d,e,f)		\
        R1.w = (W_)(a);				\
	Sp[-1] = (W_)(f);			\
	Sp[-2] = (W_)(e);			\
	/* Sp[-3] = ARGTAG(1); */		\
	Sp[-4] = (W_)(d);			\
	/* Sp[-5] = ARGTAG(1); */		\
	Sp[-6] = (W_)(c);			\
	Sp[-7] = (W_)(b);			\
	/* Sp[-8] = ARGTAG(1); */		\
	Sp -= 8;				\
	JMP_(ENTRY_CODE(Sp[8]));

#else /* 0 Regs available */

#define PUSH_P(o,x) Sp[-o] = (W_)(x)
#define PUSH_N(o,x) Sp[1-o] = (W_)(x); /* Sp[-o] = ARGTAG(1) */
#define PUSHED(m)   Sp -= (m); JMP_(ENTRY_CODE(Sp[m]));

/* Here's how to construct these macros:
 *
 *   N = number of N's in the name;
 *   P = number of P's in the name;
 *   s = N * 2 + P;
 *   while (nonNull(name)) {
 *     if (nextChar == 'P') {
 *       PUSH_P(s,_);
 *       s -= 1;
 *     } else {
 *       PUSH_N(s,_);
 *       s -= 2
 *     }
 *   }
 *   PUSHED(N * 2 + P);
 */

# define RET_P(a)     PUSH_P(1,a); PUSHED(1)
# define RET_N(a)     PUSH_N(2,a); PUSHED(2)

# define RET_PP(a,b)   PUSH_P(2,a); PUSH_P(1,b); PUSHED(2)
# define RET_NN(a,b)   PUSH_N(4,a); PUSH_N(2,b); PUSHED(4)
# define RET_NP(a,b)   PUSH_N(3,a); PUSH_P(1,b); PUSHED(3)

# define RET_PPP(a,b,c) PUSH_P(3,a); PUSH_P(2,b); PUSH_P(1,c); PUSHED(3)
# define RET_NNP(a,b,c) PUSH_N(6,a); PUSH_N(4,b); PUSH_N(2,c); PUSHED(6)

# define RET_NNNP(a,b,c,d) PUSH_N(7,a); PUSH_N(5,b); PUSH_N(3,c); PUSH_P(1,d); PUSHED(7)	
# define RET_NNPNNP(a,b,c,d,e,f) PUSH_N(10,a); PUSH_N(8,b); PUSH_P(6,c); PUSH_N(5,d); PUSH_N(3,e); PUSH_P(1,f); PUSHED(10)

#endif

#endif
#endif

/*-----------------------------------------------------------------------------
  Array Primitives

  Basically just new*Array - the others are all inline macros.

  The size arg is always passed in R1, and the result returned in R1.

  The slow entry point is for returning from a heap check, the saved
  size argument must be re-loaded from the stack.
  -------------------------------------------------------------------------- */

/* for objects that are *less* than the size of a word, make sure we
 * round up to the nearest word for the size of the array.
 */

#define BYTES_TO_STGWORDS(n) ((n) + sizeof(W_) - 1)/sizeof(W_)

#define newByteArray(ty,scale)				\
 FN_(new##ty##ArrayZh_fast)				\
 {							\
   W_ stuff_size, size, n;				\
   StgArrWords* p;					\
   FB_							\
     n = R1.w;						\
     stuff_size = BYTES_TO_STGWORDS(n*scale);		\
     size = sizeofW(StgArrWords)+ stuff_size;		\
     HP_CHK_GEN(size, NO_PTRS, new##ty##ArrayZh_fast,);	\
							\
     /* XXX probably wrong... */			\
     TICK_ALLOC_PRIM(sizeofW(StgArrWords),size,0,size);	\
     CCS_ALLOC(CCCS,size);				\
							\
     p = stgCast(StgArrWords*,Hp-size+1);		\
     SET_HDR(p, &MUT_ARR_WORDS_info, CCCS);		\
     p->words = stuff_size;				\
     RET_P(p);						\
   FE_							\
 }

newByteArray(Char,   sizeof(C_))
newByteArray(Int,    sizeof(I_));
newByteArray(Addr,   sizeof(P_));
newByteArray(Float,  sizeof(StgFloat));
newByteArray(Double, sizeof(StgDouble));

FN_(newArrayZh_fast)
{
  W_ size, n, init;
  StgArrPtrs* arr;
  StgPtr p;
  FB_
    n = R1.w;
    size = sizeofW(StgArrPtrs) + n;
    HP_CHK_GEN(size, R2_PTR, newArrayZh_fast,);
    TICK_ALLOC_PRIM(sizeofW(StgArrPtrs),(n),0,size)
    CCS_ALLOC(CCCS,size);

    arr = stgCast(StgArrPtrs*,Hp-size+1);
    SET_HDR(arr,&MUT_ARR_PTRS_info,CCCS);
    arr->ptrs = n;

    init = R2.w;
    for (p = (P_)arr + sizeofW(StgArrPtrs); 
	 p < (P_)arr + size; p++) {
	*p = (W_)init;
    }

    RET_P(arr);
  FE_
}

FN_(newMutVarZh_fast)
{
  StgMutVar* mv;
  /* Args: R1.p = initialisation value */
  FB_

  HP_CHK_GEN(sizeofW(StgMutVar), R1_PTR, newMutVarZh_fast,);
  TICK_ALLOC_PRIM(sizeofW(StgMutVar),wibble,wibble,wibble)
  CCS_ALLOC(CCCS,sizeofW(StgMutVar));

  mv = stgCast(StgMutVar*,Hp-sizeofW(StgMutVar)+1);
  SET_HDR(mv,&MUT_VAR_info,CCCS);
  mv->var = R1.cl;

  RET_P(mv);

  FE_
}

/* -----------------------------------------------------------------------------
   Foreign Object Primitives

   -------------------------------------------------------------------------- */

#ifndef PAR

StgInt eqForeignObj (StgForeignObj *p1, StgForeignObj *p2);

FN_(makeForeignObjZh_fast)
{
  /* R1.p = ptr to foreign object,
     R2.p = finaliser
  */
  StgForeignObj *result;
  FB_

  HP_CHK_GEN(sizeofW(StgForeignObj), NO_PTRS, makeForeignObjZh_fast,);

  CCS_ALLOC(CCCS,sizeofW(StgForeignObj)); /* ccs prof */

  result = (StgForeignObj *) (Hp + 1 - sizeofW(StgForeignObj));
  SET_HDR(result,&FOREIGN_info,CCCS);
  
  result -> data      = R1.p;
  result -> finaliser = R2.p;
  STGCALL1(newForeignObj,result);

  /* returns (# s#, ForeignObj# #) */
  RET_P(result);
  FE_
}

#endif /* !PAR */

/* -----------------------------------------------------------------------------
   Arbitrary-precision Integer operations.
   -------------------------------------------------------------------------- */

FN_(int2IntegerZh_fast)
{
   /* arguments: R1 = Int# */

   I_ val, s;  		/* to avoid aliasing */
   StgArrWords* p;	/* address of array result */
   FB_

   val = R1.i;
   HP_CHK_GEN(sizeofW(StgArrWords)+1, NO_PTRS, int2IntegerZh_fast,)

   p = stgCast(StgArrWords*,Hp)-1;
   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, 1);

   /* mpz_set_si is inlined here, makes things simpler */
   if (val < 0) { 
	s  = -1;
	*Hp = -val;
   } else if (val > 0) {
	s = 1;
	*Hp = val;
   } else {
	s = 0;
   }

   /* returns (# alloc :: Int#, 
		 size  :: Int#, 
		 data  :: ByteArray# 
	       #)
   */
   RET_NNP(1,s,p);
   FE_
}

FN_(word2IntegerZh_fast)
{
   /* arguments: R1 = Word# */

   W_ val;  		/* to avoid aliasing */
   I_  s;
   StgArrWords* p;	/* address of array result */
   FB_

   val = R1.w;
   HP_CHK_GEN(sizeofW(StgArrWords)+1, NO_PTRS, word2IntegerZh_fast,)

   p = stgCast(StgArrWords*,Hp)-1;
   SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, 1);

   if (val != 0) {
	s = 1;
	*Hp = val;
   } else {
	s = 0;
   }

   /* returns (# alloc :: Int#, 
		 size  :: Int#, 
		 data  :: ByteArray# 
	       #)
   */
   RET_NNP(1,s,p);
   FE_
}


/* Integer arithmetic */

#ifdef DEBUG
extern nat  gmp_alloc_budget;
#define SET_GMP_ALLOC_BUDGET(n)  gmp_alloc_budget = (n);
#define RESET_GMP_ALLOC_BUDGET() gmp_alloc_budget = 0;
#else
#define SET_GMP_ALLOC_BUDGET(n)  /* nothing */
#define RESET_GMP_ALLOC_BUDGET() /* nothing */
#endif

/* These are quite conservative, but it doesn't matter: we get back
 * any unclaimed heap space after the operation.
 */

#define __abs(a)		(( (a) >= 0 ) ? (a) : (-(a)))
#define GMP_SAME_SIZE(a)	(__abs(a) + sizeofW(StgArrWords) + 16)
#define GMP_MAX_SIZE(a,b)	((__abs(a) > __abs(b) ? __abs(a) : __abs(b)) + 1 + sizeofW(StgArrWords) + 16)
				/* NB: the +1 is for the carry (or whatever) */
#define GMP_2MAX_SIZE(a,b)	(2 * GMP_MAX_SIZE(a,b))
#define GMP_ADD_SIZES(a,b)	(__abs(a) + __abs(b) + 1 + sizeofW(StgArrWords) + 16)
				/* the +1 may just be paranoia */

#define GMP_HEAP_LOOKAHEAD(n, liveness, ret) \
  HP_CHK_GEN(n, liveness, ret,); \
  SAVE_Hp = Hp - (n); \
  TICK_UNALLOC_HEAP(n); \
  SET_GMP_ALLOC_BUDGET(n); 

#define GMP_HEAP_HANDBACK() \
  Hp = SAVE_Hp; \
  RESET_GMP_ALLOC_BUDGET();

/* ToDo: this is shockingly inefficient */

#define GMP_TAKE2_RET1(name,size_macro,mp_fun)				\
FN_(name)								\
{									\
  MP_INT arg1, arg2, result;						\
  I_ space;								\
  I_ a1, s1, a2, s2;							\
  StgArrWords* d1;							\
  StgArrWords* d2;							\
  FB_									\
 									\
  a1 = R1.i;								\
  s1 = R2.i;								\
  d1 = stgCast(StgArrWords*,R3.p);					\
  a2 = R4.i;								\
  s2 = R5.i;								\
  d2 = stgCast(StgArrWords*,R6.p);					\
									\
  space = size_macro(s1,s2);						\
									\
  /* Check that there will be enough heap,				\
   * and make Hp visible to GMP allocator 				\
   */									\
  GMP_HEAP_LOOKAHEAD(space,R3_PTR | R6_PTR,name);			\
									\
  /* Now we can initialise (post possible GC) */			\
  arg1._mp_alloc	= (a1);						\
  arg1._mp_size		= (s1);						\
  arg1._mp_d		= (unsigned long int *) (BYTE_ARR_CTS(d1));	\
  arg2._mp_alloc	= (a2);						\
  arg2._mp_size		= (s2);						\
  arg2._mp_d		= (unsigned long int *) (BYTE_ARR_CTS(d2));	\
									\
  STGCALL1(mpz_init,&result);						\
									\
  /* Perform the operation */						\
  STGCALL3(mp_fun,&result,&arg1,&arg2);					\
									\
  GMP_HEAP_HANDBACK();		/* restore Hp */			\
									\
  RET_NNP(result._mp_alloc, 						\
	  result._mp_size, 						\
          result._mp_d-sizeofW(StgArrWords));				\
  FE_									\
}

#define GMP_TAKE2_RET2(name,size_macro,mp_fun)				\
FN_(name)								\
{									\
  MP_INT arg1, arg2, result1, result2;					\
  I_ space;								\
  I_ a1, s1, a2, s2;							\
  StgArrWords* d1;							\
  StgArrWords* d2;							\
  FB_									\
 									\
  a1 = R1.i;								\
  s1 = R2.i;								\
  d1 = stgCast(StgArrWords*,R3.p);					\
  a2 = R4.i;								\
  s2 = R5.i;								\
  d2 = stgCast(StgArrWords*,R6.p);					\
									\
  space = size_macro(s1,s2);						\
									\
  /* Check that there will be enough heap,				\
   * and make Hp visible to GMP allocator 				\
   */									\
  GMP_HEAP_LOOKAHEAD(space,R3_PTR | R6_PTR,name);			\
									\
  /* Now we can initialise (post possible GC) */			\
  arg1._mp_alloc	= (a1);						\
  arg1._mp_size		= (s1);						\
  arg1._mp_d		= (unsigned long int *) (BYTE_ARR_CTS(d1));	\
  arg2._mp_alloc	= (a2);						\
  arg2._mp_size		= (s2);						\
  arg2._mp_d		= (unsigned long int *) (BYTE_ARR_CTS(d2));	\
									\
  STGCALL1(mpz_init,&result1);						\
  STGCALL1(mpz_init,&result2);						\
									\
  /* Perform the operation */						\
  STGCALL4(mp_fun,&result1,&result2,&arg1,&arg2);			\
									\
  GMP_HEAP_HANDBACK();		/* restore Hp */			\
									\
  RET_NNPNNP(result1._mp_alloc,						\
	     result1._mp_size, 						\
             result1._mp_d-sizeofW(StgArrWords),			\
	     result2._mp_alloc,						\
	     result2._mp_size, 						\
             result2._mp_d-sizeofW(StgArrWords));			\
  FE_									\
}

GMP_TAKE2_RET1(plusIntegerZh_fast,  GMP_MAX_SIZE,  mpz_add);
GMP_TAKE2_RET1(minusIntegerZh_fast, GMP_MAX_SIZE,  mpz_sub);
GMP_TAKE2_RET1(timesIntegerZh_fast, GMP_ADD_SIZES, mpz_mul);

GMP_TAKE2_RET2(quotRemIntegerZh_fast, GMP_2MAX_SIZE, mpz_tdiv_qr);
GMP_TAKE2_RET2(divModIntegerZh_fast,  GMP_2MAX_SIZE, mpz_fdiv_qr);

FN_(addr2IntegerZh_fast)
{
  MP_INT result;
  char *str;
  I_ space;
  FB_

  /* args:   R1 :: Addr# */
  str = R1.a;

  /* taking the number of bytes/8 as the number of words of lookahead
     is plenty conservative */
  space = GMP_SAME_SIZE( RET_STGCALL1(nat, stg_strlen, str) / 8 + 1 );

  GMP_HEAP_LOOKAHEAD(space, NO_PTRS, addr2IntegerZh_fast);

  /* Perform the operation */
  if (RET_STGCALL3(int, mpz_init_set_str,&result,(str),/*base*/10))
      abort();

  GMP_HEAP_HANDBACK();		/* restore Hp */

  RET_NNP(result._mp_alloc, result._mp_size, 
	  result._mp_d - sizeofW(StgArrWords));
  FE_
}

#ifndef FLOATS_AS_DOUBLES
FN_(decodeFloatZh_fast)
{ 
  MP_INT mantissa;
  I_ exponent;
  StgArrWords* p;
  StgFloat arg;
  FB_

  /* arguments: F1 = Float# */
  arg = F1;

  HP_CHK_GEN(sizeof(StgArrWords)+1, NO_PTRS, decodeFloatZh_fast,);

  /* Be prepared to tell Lennart-coded __decodeFloat	*/
  /* where mantissa._mp_d can be put (it does not care about the rest) */
  p = stgCast(StgArrWords*,Hp)-1;
  SET_ARR_HDR(p,&ARR_WORDS_info,CCCS,1)
  mantissa._mp_d = (void *)BYTE_ARR_CTS(p);

  /* Perform the operation */
  STGCALL3(__decodeFloat,&mantissa,&exponent,arg);

  /* returns: (R1 = Int# (expn), R2 = Int#, R3 = Int#, R4 = ByteArray#) */
  RET_NNNP(exponent,mantissa._mp_alloc,mantissa._mp_size,p);
  FE_
}
#endif /* !FLOATS_AS_DOUBLES */

#define DOUBLE_MANTISSA_SIZE (sizeof(StgDouble)/sizeof(W_))
#define ARR_SIZE (sizeof(StgArrWords) + DOUBLE_MANTISSA_SIZE)

FN_(decodeDoubleZh_fast)
{ MP_INT mantissa;
  I_ exponent;
  StgDouble arg;
  StgArrWords* p;
  FB_

  /* arguments: D1 = Double# */
  arg = D1;

  HP_CHK_GEN(ARR_SIZE, NO_PTRS, decodeFloatZh_fast,);

  /* Be prepared to tell Lennart-coded __decodeDouble	*/
  /* where mantissa.d can be put (it does not care about the rest) */
  p = stgCast(StgArrWords*,Hp-ARR_SIZE+1);
  SET_ARR_HDR(p, &ARR_WORDS_info, CCCS, DOUBLE_MANTISSA_SIZE);
  mantissa._mp_d = (void *)BYTE_ARR_CTS(p);

  /* Perform the operation */
  STGCALL3(__decodeDouble,&mantissa,&exponent,arg);

  /* returns: (R1 = Int# (expn), R2 = Int#, R3 = Int#, R4 = ByteArray#) */
  RET_NNNP(exponent,mantissa._mp_alloc,mantissa._mp_size,p);
  FE_
}

/*-----------------------------------------------------------------------------
  Seq frames 

  We don't have a primitive seq# operator: it is just a 'case'
  expression whose scrutinee has either a polymorphic or function type
  (constructor types can be handled by normal 'case' expressions).

  To handle a polymorphic/function typed seq, we push a SEQ_FRAME on
  the stack.  This is a polymorphic activation record that just pops
  itself and returns when entered.  The purpose of the SEQ_FRAME is to
  act as a barrier in case the scrutinee is a partial application - in
  this way it is just like an update frame, except that it doesn't
  update anything.
  -------------------------------------------------------------------------- */

#define SEQ_FRAME_ENTRY_TEMPLATE(label,ret) 	\
   IFN_(label)					\
   {						\
      FB_					\
      Su = stgCast(StgSeqFrame*,Sp)->link;	\
      Sp += sizeofW(StgSeqFrame);		\
      JMP_(ret);				\
      FE_					\
   }

SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_entry,ENTRY_CODE(Sp[0]));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_0_entry,RET_VEC(Sp[0],0));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_1_entry,RET_VEC(Sp[0],1));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_2_entry,RET_VEC(Sp[0],2));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_3_entry,RET_VEC(Sp[0],3));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_4_entry,RET_VEC(Sp[0],4));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_5_entry,RET_VEC(Sp[0],5));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_6_entry,RET_VEC(Sp[0],6));
SEQ_FRAME_ENTRY_TEMPLATE(seq_frame_7_entry,RET_VEC(Sp[0],7));

VEC_POLY_INFO_TABLE(seq_frame,1, NULL/*srt*/, 0/*srt_off*/, 0/*srt_len*/, SEQ_FRAME);

/* -----------------------------------------------------------------------------
   Concurrency primitives
   -------------------------------------------------------------------------- */

/* tmp */
#define INIT_STACK_SIZE 1024

FN_(forkZh_fast)
{
  StgTSO *tso;

  FB_
  /* args: R1 = closure to spark */
  
  if (closure_SHOULD_SPARK(stgCast(StgClosure*,R1.p))) {
    tso = (StgTSO *)(Hp+1);
    HP_CHK_GEN(sizeofW(StgTSO) + INIT_STACK_SIZE/sizeof(W_), 
	       R1_PTR, forkZh_fast,);

    /* create it right now */
    STGCALL3(initGenThread, tso, INIT_STACK_SIZE/sizeof(W_), R1.cl);
      
    /* switch at the earliest opportunity */ 
    context_switch = 1;
  }
  
  /* return value: 1 */
  R1.i = 1;
  JMP_(*Sp);

  FE_
}

FN_(newMVarZh_fast)
{
  StgMVar *mvar;

  FB_
  /* args: none */

  HP_CHK_GEN(sizeofW(StgMVar), NO_PTRS, newMVarZh_fast,);
  
  mvar = (StgMVar *) (Hp - sizeofW(StgMVar) + 1);
  SET_INFO(mvar,&EMPTY_MVAR_info);
  mvar->head = mvar->tail = mvar->value = (P_)&END_TSO_QUEUE_closure;

  R1.p = (P_)mvar;

  /* assume it's a direct return; don't want to enter the MVAR */
  JMP_(ENTRY_CODE(Sp[0]));

  FE_
}

FN_(takeMVarZh_fast)
{
  StgMVar *mvar;

  FB_
  /* args: R1 = MVar closure */

  mvar = (StgMVar *)R1.p;

  /* If the MVar is empty, put ourselves on its blocking queue,
   * and wait until we're woken up.
   */
  if (GET_INFO(mvar) != &FULL_MVAR_info) {
    if (mvar->head == (P_)&END_TSO_QUEUE_closure) {
      mvar->head = (P_)CurrentTSO;
    } else {
      ((StgTSO *)mvar->tail)->link = CurrentTSO;
    }
    CurrentTSO->link = (StgTSO *)&END_TSO_QUEUE_closure;
    mvar->tail = (P_)CurrentTSO;

    BLOCK(R1_PTR, takeMVarZh_fast);
  }

  SET_INFO(mvar,&EMPTY_MVAR_info);
  R1.p = mvar->value;
  mvar->value = (P_)&END_TSO_QUEUE_closure;

  JMP_(ENTRY_CODE(Sp[0]));

  FE_
}

FN_(putMVarZh_fast)
{
  StgMVar *mvar;
  StgTSO *tso;

  FB_
  /* args: R1 = MVar, R2 = value */

  mvar = (StgMVar *)R1.p;
  if (GET_INFO(mvar) == &FULL_MVAR_info) {
    fflush(stdout);
    fprintf(stderr, "putMVar#: MVar already full.\n");
    exit(EXIT_FAILURE);
  }
  
  SET_INFO(mvar,&FULL_MVAR_info);
  mvar->value = R2.p;

  /* wake up the first thread on the queue,
   * it will continue with the takeMVar operation and mark the MVar
   * empty again.
   */
  tso = (StgTSO *)mvar->head;
  if (tso != (StgTSO *)&END_TSO_QUEUE_closure) {
    PUSH_ON_RUN_QUEUE(tso);
    mvar->head = (P_)tso->link;
    tso->link = (StgTSO *)&END_TSO_QUEUE_closure;
    if (mvar->head == (P_)&END_TSO_QUEUE_closure) {
      mvar->tail = (P_)&END_TSO_QUEUE_closure;
    }
  }

  /* ToDo: yield here for better communication performance? */
  JMP_(ENTRY_CODE(*Sp));

  FE_
}

/* -----------------------------------------------------------------------------
   Exception Primitives
   -------------------------------------------------------------------------- */

STGFUN(raise_entry);
FN_(catchZh_fast);
FN_(raiseZh_fast);
static const StgInfoTable raise_info;

#define CATCH_FRAME_ENTRY_TEMPLATE(label,ret) 	\
   FN_(label);					\
   FN_(label)					\
   {						\
      FB_					\
      Su = ((StgCatchFrame *)Sp)->link;		\
      Sp += sizeofW(StgCatchFrame);		\
      JMP_(ret);				\
      FE_					\
   }

CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_entry,ENTRY_CODE(Sp[0]));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_0_entry,RET_VEC(Sp[0],0));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_1_entry,RET_VEC(Sp[0],1));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_2_entry,RET_VEC(Sp[0],2));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_3_entry,RET_VEC(Sp[0],3));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_4_entry,RET_VEC(Sp[0],4));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_5_entry,RET_VEC(Sp[0],5));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_6_entry,RET_VEC(Sp[0],6));
CATCH_FRAME_ENTRY_TEMPLATE(catch_frame_7_entry,RET_VEC(Sp[0],7));

/* Catch frames are very similar to update frames, but when entering
 * one we just pop the frame off the stack and perform the correct
 * kind of return to the activation record underneath us on the stack.
 */

VEC_POLY_INFO_TABLE(catch_frame,1, NULL/*srt*/, 0/*srt_off*/, 0/*srt_len*/, CATCH_FRAME);

FN_(catchZh_fast)
{
  StgCatchFrame *fp;
  FB_

    /* args: R1 = m, R2 = k */
    STK_CHK_GEN(sizeofW(StgCatchFrame), R1_PTR | R2_PTR, catchZh_fast, );
    Sp -= sizeofW(StgCatchFrame);
    fp = stgCast(StgCatchFrame*,Sp);
    SET_HDR(fp,(StgInfoTable *)&catch_frame_info,CC_catch);
    fp -> handler = R2.cl;
    fp -> link = Su;
    Su = stgCast(StgUpdateFrame*,fp);
    
#if PROFILING
    fp->header.prof.ccs   = CCCS;
#endif
    
    TICK_ENT_VIA_NODE();
    JMP_(ENTRY_CODE(*R1.p));         
    
  FE_
}      

// The raise infotable
// 
// This should be exactly the same as would be generated by this STG code
//
//   raise = {err} \n {} -> raise#{err}
//
// It is used in raiseZh_fast to update thunks on the update list
INFO_TABLE(raise_info,raise_entry,1,0,FUN,static const,IF_,0,0);
STGFUN(raise_entry)
{
  FB_
  R1.cl = payloadCPtr(R1.cl,0);
  JMP_(raiseZh_fast);
  FE_
}

FN_(raiseZh_fast)
{
  StgClosure *handler;
  StgUpdateFrame *p;
  FB_
    /* args : R1 = error */

    p = Su;

    while (1) {

      switch (get_itbl(p)->type) {

      case UPDATE_FRAME:
	UPD_INPLACE1(p->updatee,&raise_info,R1.cl);
	p = p->link;
	continue;

      case SEQ_FRAME:
	p = stgCast(StgSeqFrame*,p)->link;
	continue;

      case CATCH_FRAME:
	/* found it! */
	break;

      case STOP_FRAME:
	barf("raiseZh_fast: STOP_FRAME");

      default:
	barf("raiseZh_fast: weird activation record");
      }
      
      break;

    }
    
    /* Ok, p points to the enclosing CATCH_FRAME.  Pop everything down to
     * and including this frame, update Su, push R1, and enter the handler.
     */
    Su = ((StgCatchFrame *)p)->link; 
    handler = ((StgCatchFrame *)p)->handler;
    
    Sp = stgCast(StgPtr,p) + sizeofW(StgCatchFrame) - 1;
    *Sp = R1.w;

    TICK_ENT_VIA_NODE();
    R1.cl = handler;
    JMP_(ENTRY_CODE(handler->header.info));
    
  FE_
}

