/* * ANS Forth core engine. This program implements a Forth interpreter, * along with native implementations of some of the ANS Forth standard * words. Along with a Forth implementation of other words, this * program provides a functional Forth implementation. * * This Forth system has the following characteristics: * -- it is written in standard C and should run on many machine types * (there are some conditions which are met by virtually all modern * desktop computers); * -- it implements a virtual machine, whose memory size and natural * data length are configurable; * -- it checks all memory accesses and never crashes, whatever the * Forth code does (instead of crashing, the program quits gracefully); * -- it is optimized not for speed but for conciseness and clarity. * * This file is in Public Domain. */ /* * This program works on machines with the following characteristics: * -- two's complement arithmetics which truncates data on overflow and * underflow; * -- no trap representation nor invalid encodings for arithmetic types; * -- 8-bit bytes (CHAR_BIT == 8); * -- at least one suitable integer type: its size must be a power of 2, * greater than or equal to 2 bytes. * * The source files are interpreted as ASCII (or anything which is * ASCII-compatible), regardless of the host charset; however, fgets() * is used to process input source files, hence the host charset must * not conflict too badly with ASCII, especially for end-of-line * indicators. */ #include #include #include #include #include /* * Generic failure function. Whenever something goes wrong, we print * an error message on stderr, (syntax is similar to printf()) and then * we exit. */ static void panic(char *fmt, ...) { va_list ap; va_start(ap, fmt); fprintf(stderr, "panic: "); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); va_end(ap); exit(EXIT_FAILURE); } /* * Cell type definition. This is the basic Forth data type; a cell must * be wide enough to encode all Forth addresses. The cell type must have * a length which is a power of 2, greater than or equal to 2. * * The default type is "long", which will yield a 32-bit or a 64-bit Forth * on most machines. * * The Forth "address units" are C "unsigned char" (bytes) and a character * always uses exactly one address unit. */ typedef long cell; typedef unsigned long ucell; #define DCELL "%ld" #define UCELL "%lu" #define XCELL "0x%08lX" #define CELLMIN LONG_MIN #define CELLMAX LONG_MAX #define UCELLMAX ULONG_MAX /* * Helper macros. */ #define CELLSIZE (sizeof(cell)) #define CELLS(x) ((x) * CELLSIZE) /* * The "dictionary" is the whole virtual machine memory area. Its * length is expressed in bytes. Within the Forth system, an address * is an offset in that area. * * The word headers, names and definitions go to the beginning of that * area. The data and return stack are located at the end. The input * buffer is dynamically allocated at the end, just below the two stacks. */ #define DICTIONARY_LEN 1048576 /* bytes */ #define DATA_STACK_LEN 1152 /* cells */ #define RETURN_STACK_LEN 1024 /* cells */ #define PAD_LEN 256 /* bytes */ #define MIN_LINE_LEN 128 /* bytes */ /* * The dictionary is a static array. Modern Unix systems will allocate * pages when they are accessed, hence there is no problem in having * a large dictionary area. We use a "union" definition in order to * ensure cell alignment. */ static union { unsigned char b[DICTIONARY_LEN]; cell c[DICTIONARY_LEN / CELLSIZE]; } dictionary; /* * Stacks grow towards lower addresses. The stack pointers point to the * TOS (Top Of Stack). */ #define DATA_STACK_BEGIN (DICTIONARY_LEN - DATA_STACK_LEN) #define DATA_STACK_END DICTIONARY_LEN #define RETURN_STACK_BEGIN (DATA_STACK_BEGIN - RETURN_STACK_LEN) #define RETURN_STACK_END DATA_STACK_BEGIN #define RESERVED_BEGIN RETURN_STACK_BEGIN /* * The dictionary begins with some reserved data; those cells contain * the data which is needed for proper communication between the C world * and the Forth world. * * Memory layout: * constant -1 1 cell used for CREATEd words (-1 == OP_EXIT) * BASE 1 cell contains the current parsing base * HERE pointer 1 cell contains the value of HERE * current xt ptr 1 cell points to the currently defined code * last word ptr 1 cell points to the last defined word * source ID 1 cell contains the source ID * source buffer ptr 1 cell points to the source buffer * source buffer length 1 cell contains the source buffer length * >IN 1 cell >IN value * STATE 1 cell -1 when compiling, 0 otherwise. * current wordlist 1 cell pointer to current definition wordlist * current search 1 cell pointer to current search top wordlist */ /* * These macros contain the reserved data field addresses. */ #define FA_BASE CELLSIZE #define FA_HERE (FA_BASE + CELLSIZE) #define FA_CURRENT_XT (FA_HERE + CELLSIZE) #define FA_LAST_WORD (FA_CURRENT_XT + CELLSIZE) #define FA_SOURCE_ID (FA_LAST_WORD + CELLSIZE) #define FA_SOURCE_PTR (FA_SOURCE_ID + CELLSIZE) #define FA_SOURCE_LEN (FA_SOURCE_PTR + CELLSIZE) #define FA_TOIN (FA_SOURCE_LEN + CELLSIZE) #define FA_STATE (FA_TOIN + CELLSIZE) #define FA_CURRENT_LIST (FA_STATE + CELLSIZE) #define FA_CURRENT_SEARCH (FA_CURRENT_LIST + CELLSIZE) #define FA_EORA (FA_CURRENT_SEARCH + CELLSIZE) /* * Those macros implement data fetch and store, both for cells and * bytes. They are used when the address is known to be valid and * correctly aligned (e.g., for the reserved field addresses). */ #define FETCH(addr) (*(cell *)(dictionary.b + (addr))) #define STORE(addr, value) (*(cell *)(dictionary.b + (addr)) = (value)) #define FETCH8(addr) (*(dictionary.b + (addr))) #define STORE8(addr, value) (*(dictionary.b + (addr)) = (value)) /* * Fetch the cell value at the provided address. Panic on unaligned * or out-of-range address. */ static cell fetch(ucell addr) { if ((addr & (CELLSIZE - 1)) != 0) panic("unaligned read access: " XCELL, addr); if (addr > DICTIONARY_LEN - CELLSIZE) panic("read access (cell) to invalid address: " XCELL, addr); return FETCH(addr); } /* * Store the cell "value" at the provided address. Panic on unaligned * or out-of-range address. */ static void store(ucell addr, cell value) { if ((addr & (CELLSIZE - 1)) != 0) panic("unaligned write access: " XCELL, addr); if (addr > DICTIONARY_LEN - CELLSIZE) panic("write access (cell) to invalid address: " XCELL, addr); STORE(addr, value); } /* * Fetch the byte at the provided address. Panic on out-of-range address. */ static cell fetch8(ucell addr) { if (addr >= DICTIONARY_LEN) panic("read access (byte) to invalid address: " XCELL, addr); return FETCH8(addr); } /* * Store the byte "value" at the provided address. Panic on out-of-range * address. The value is suitably truncated. */ static void store8(ucell addr, cell value) { if (addr >= DICTIONARY_LEN) panic("write access (byte) to invalid address: " XCELL, addr); STORE8(addr, value); } /* * Data and return stack pointers are kept as native C pointers. They * are always cell-aligned. */ static cell *data_stack_ptr, *return_stack_ptr; /* * Push a value on the data stack. Panic on stack overflow (stack pointer * out of range). */ static void push(cell value) { if ((unsigned char *)data_stack_ptr <= &dictionary.b[DATA_STACK_BEGIN]) panic("data stack overflow (push)"); *-- data_stack_ptr = value; } /* * Pop a value from the data stack. Panic on stack overflow (stack pointer * out of range) or unaligned stack pointer. */ static cell pop(void) { if ((unsigned char *)data_stack_ptr >= &dictionary.b[DATA_STACK_END]) panic("data stack underflow (pop)"); return *data_stack_ptr ++; } /* * Get a copy of a value in the data stack (element 0 is the TOS, element 1 * is the one just below,...). Panic on stack underflow. */ static cell look(ucell elt) { if ((ucell)(&dictionary.c[DATA_STACK_END / CELLSIZE] - data_stack_ptr) <= elt) panic("data stack underflow (look)"); return *(data_stack_ptr + elt); } /* * Push a value on the return stack. Panic on stack overflow. */ static void push_ret(cell value) { if ((unsigned char *)return_stack_ptr <= &dictionary.b[RETURN_STACK_BEGIN]) panic("return stack overflow (push)"); *-- return_stack_ptr = value; } /* * Pop a value from the return stack. Panic on stack overflow (stack pointer * out of range) or unaligned stack pointer. */ static cell pop_ret(void) { if ((unsigned char *)return_stack_ptr >= &dictionary.b[RETURN_STACK_END]) panic("return stack underflow (pop)"); return *return_stack_ptr ++; } /* * Get a copy of a value in the return stack (element 0 is the TOS, element 1 * is the one just below,...). Panic on stack underflow. */ static cell look_ret(ucell elt) { if ((ucell)(&dictionary.c[RETURN_STACK_END / CELLSIZE] - return_stack_ptr) <= elt) panic("return stack underflow (look)"); return *(return_stack_ptr + elt); } /* * Check that the provided range (address and length) fits completely * within the dictionary area. Panic if this is not true. */ static void check_range(ucell addr, ucell len) { if (len > DICTIONARY_LEN || addr > DICTIONARY_LEN || (addr + len) > DICTIONARY_LEN) panic("invalid range: " XCELL "[" UCELL "]", addr, len); } /* * Get a native (C) pointer to the byte whose address is provided as * argument. This is used along with check_range(), in order to use * native C functions such as memcpy() with Forth addresses. */ static void * native_address(ucell addr) { return &dictionary.b[addr]; } #ifdef DEBUG /* * Write a Forth string (provided as address and length, with no * zero-byte terminator) to stderr. */ static void logword(ucell addr, ucell len) { fwrite(native_address(addr), 1, len, stderr); } #endif /* * Data stack depth. */ static ucell data_stack_depth() { return &dictionary.c[DATA_STACK_END / CELLSIZE] - data_stack_ptr; } #if 0 /* currently unused */ /* * Return stack depth. */ static ucell return_stack_depth() { return &dictionary.c[RETURN_STACK_END / CELLSIZE] - return_stack_ptr; } #endif /* * Native SWAP */ static void swap(void) { cell tmp; if (data_stack_depth() < 2) panic("data stack underflow (swap)"); tmp = *data_stack_ptr; *data_stack_ptr = *(data_stack_ptr + 1); *(data_stack_ptr + 1) = tmp; } /* * Native ROT */ static void rot(void) { cell tmp; if (data_stack_depth() < 3) panic("data stack underflow (rot)"); tmp = *(data_stack_ptr + 2); *(data_stack_ptr + 2) = *(data_stack_ptr + 1); *(data_stack_ptr + 1) = *data_stack_ptr; *data_stack_ptr = tmp; } /* * Native -ROT (not in ANS Forth; may be defined as such: : -ROT ROT ROT ; ) */ static void notrot(void) { cell tmp; if (data_stack_depth() < 3) panic("data stack underflow (-rot)"); tmp = *data_stack_ptr; *data_stack_ptr = *(data_stack_ptr + 1); *(data_stack_ptr + 1) = *(data_stack_ptr + 2); *(data_stack_ptr + 2) = tmp; } /* * Native DUP */ static void dup(void) { push(look(0)); } /* * Native OVER */ static void over(void) { push(look(1)); } /* * Native ROLL */ static void roll(void) { ucell cc = pop(); cell tmp; if (cc == 0) return; tmp = look(cc); memmove(data_stack_ptr + 1, data_stack_ptr, CELLS(cc)); *data_stack_ptr = tmp; } /* * Native PICK */ static void pick(void) { ucell cc = pop(); push(look(cc)); } /* * Native ALLOT * We check that the memory may indeed be allocated: when the dictionary * area grows, it must leave enough room for the PAD, which must not * overlap with the source buffer. */ static void allot(cell num) { ucell here; if (num == 0) return; here = FETCH(FA_HERE); if (here > DICTIONARY_LEN) panic("HERE corrupted"); if (num > 0 && ((ucell)num + here + PAD_LEN) > (ucell)FETCH(FA_SOURCE_PTR)) panic("dictionary is full"); STORE(FA_HERE, here + num); } /* * Native ALIGN */ static void align(void) { ucell here = FETCH(FA_HERE); ucell ua = here % CELLSIZE; if (ua == 0) return; allot(CELLSIZE - ua); } /* * Compute an aligned address: this is the first cell-aligned address * greater than or equal to the provided address. */ static ucell align_address(ucell addr) { ucell ua = addr % CELLSIZE; if (ua == 0) return addr; return addr + CELLSIZE - ua; } /* * Words. * * A word is defined by a header in the dictionary area. The header * contains the following data: * * name length 1 cell name length in characters * CFA-I 1 cell interpretation semantics * CFA-C 1 cell compilation semantics * LFA 1 cell link field (for wordlists) * name x cells name + padding (for alignment) * * CFA-I points to the interpretation code. * CFA-C points to the compilation code. If that field contains 0, then * the word has default compilation semantics. An immediate word is * a word where CFA-C contains the same value than CFA-I. * LFA is a pointer use to keep words linked in the wordlist. * * Following the header is the name itself. * * Older Forth (pre-ANS) had a completely specified word header format, * and application code relied on it. ANS Forth has thrown all this into * the "ambiguous condition" dark area, mostly so that native code generator * could be implemented. The word header we specify here is NOT the one used * in older Forth implementations. Traditionnal Forth systems use a unique * CFA and an "immediate" flag, but this leads to problems when a word such * as TO must be defined: it has both interpretation and non-default * compilation semantics. With only one CFA and one immediate flag, TO * can only be implemented as a "state-smart" immediate word (a word which * tests the value of STATE); such words may fail when ticked. */ /* * Those macros contain the word header field offsets within the header. * No code (neither C nor Forth) relies on the precise field order. */ #define WH_NAMELEN 0 #define WH_CFA_I CELLS(1) #define WH_CFA_C CELLS(2) #define WH_LFA CELLS(3) #define WH_NAME CELLS(4) /* * An execution token (xt) is a cell value which can reference: * -- a native word implementation * -- a non-native word implementation * -- an opcode * * Opcodes are special xts which cannot appear as such at application * level; they have meaning only to the interpreter. * * The xt for a non-native word implementation is simply the code address: * thet is the address of the first code cell. A non-native code is a * list of consecutive cells, each containing an xt. Therefore, all such * xts are even values. * * xt for native words and opcodes have odd values. For native words, * (xt >> 1) is the native word index in a table (defined below). Opcodes * use special values (when converted to "cell", opcodes assume negative * values). */ /* * Opcodes. * * OP_EXIT takes no argument. When encountered, the current word is exited. * * OP_JMP takes one argument, which is a displacement value. The new ip * value is the sum of the argument address and the argument value * (hence, if the argument is equal to CELLSIZE, this opcode is a nop). * * OP_IF takes one argument, with the same semantics than OP_JMP. However, * OP_IF pops one value from the data stack, and performs the jump if * and only if the popped value is 0. Otherwise, execution continues just * after the OP_IF argument cell. * * OP_LIT takes one argument, which is pushed on the data stack. * * OP_ALIT takes one argument and jumps like OP_JMP; moreover, it pushes * on the data stack the address of the cell immediately following its * argument cell. OP_ALIT is used for embedding static strings (such as * those obtained with S" ). * * OP_BODY takes one argument, which is an xt. It first pushes on the * data stack the address of the cell immediately following its argument * cell, and then _jumps_ to the xt-designated code address. Note that * this is not a word call (with ip pushed on the returned stack) but a * full branch. Newly CREATEd words begin with an OP_BODY and a argument * of 0, which points to the cell 0 where an opcode OP_EXIT is stored. * DOES> updates the OP_BODY argument with a new xt. OP_BODY is never * encountered except as the very first opcode of CREATEd words. * * OP_EXECUTE takes no argument. It pops one xt from the data stack, * and executes the corresponding word. */ #define OP_EXIT -1 /* exit from the current word */ #define OP_JMP -3 /* unconditionnal branch */ #define OP_IF -5 /* conditionnal branch (consumes a flag from stack) */ #define OP_LIT -7 /* embedded literal value */ #define OP_ALIT -9 /* combined embedded value and branch */ #define OP_BODY -11 /* initiator for CREATEd words */ #define OP_EXECUTE -13 /* used for EXECUTE */ /* * Defined further (native word invocation). */ static void native_exec(ucell cfa); /* * Execute the provided xt, which may be either native or not (but it * must not be an opcode). */ static void interp(ucell ip) { cell *saved_rp; if (ip & 1) { native_exec(ip >> 1); return; } saved_rp = return_stack_ptr; for (;;) { ucell xt = fetch(ip); ip += CELLSIZE; if (xt & 1) { cell op = (cell)xt; if (op >= 0) { native_exec(xt >> 1); } else switch (op) { ucell arg; case OP_EXIT: if (return_stack_ptr >= saved_rp) return; ip = pop_ret(); break; case OP_JMP: arg = fetch(ip); ip += arg; break; case OP_IF: arg = fetch(ip); if (pop()) ip += CELLSIZE; else ip += arg; break; case OP_LIT: arg = fetch(ip); push(arg); ip += CELLSIZE; break; case OP_ALIT: push(ip + CELLSIZE); arg = fetch(ip); ip += arg; break; case OP_BODY: push(ip + CELLSIZE); arg = fetch(ip); ip = arg; break; case OP_EXECUTE: arg = pop(); if (arg & 1) { native_exec(arg >> 1); } else { push_ret(ip); ip = arg; } break; default: panic("invalid opcode: " XCELL, xt); } } else { push_ret(ip); ip = xt; } } } /* * Create a word header, with the provided name. Return value is the * newly created header address. The word is not yet linked in the * current wordlist. * * CFA-I is initialized to 0. Since the cell at address 0 contains the * constant OP_EXIT value, the default word interpretation semantics * are to do nothing at all. * * CFA-C is initialized to 0. This means default compilation semantics. */ static ucell create_header(unsigned char *name, size_t name_len) { ucell addr, len; unsigned char tmp[WH_NAME], *buf; size_t tmp_len = sizeof tmp; /* * We arbitrarily limit the name length (to an utterly absurd * value) in order to avoid some overflow problems. */ if (name_len > (CELLMAX / 4)) panic("word name too long"); /* * We store the name beginning in a temporary buffer in order * to handle word creation when the header itself overwrites * the name buffer. I'm not sure such a situation is unambiguously * allowed by ANS Forth anyway. */ if (tmp_len > name_len) tmp_len = name_len; memcpy(tmp, name, tmp_len); align(); addr = FETCH(FA_HERE); len = WH_NAME + align_address(name_len); allot(len); STORE(addr + WH_NAMELEN, name_len); STORE(addr + WH_CFA_I, 0); STORE(addr + WH_CFA_C, 0); STORE(addr + WH_LFA, 0); buf = native_address(addr + WH_NAME); if (name_len > sizeof tmp) memmove(buf + sizeof tmp, name + sizeof tmp, name_len - sizeof tmp); memcpy(buf, tmp, tmp_len); #ifdef DEBUG fprintf(stderr, "created header for {"); fwrite(name, 1, name_len, stderr); fprintf(stderr, "} at " XCELL "\n", addr); #endif return addr; } /* * Wordlists. * * A wordlist is just a structure indexing a collection of words. It * acts as a hash table; the WL_BUCKET_NUM macro contains the number * of "buckets" in the table. All words in the same bucket are organized * as a linked list through their respective LFA fields (the address 0 * is the list terminator). * * Wordlists contain also a pointer to the next list in the current * search order. * * Wordlist memory layout: * pointer to next wordlist 1 cell * pointer to words WL_BUCKET_NUM cells */ #define WL_BUCKET_NUM 32 /* should be a power of 2 */ /* * Get the canonical value for a character; this converts lowercase ASCII * characters to uppercase. */ static unsigned ccanon(unsigned c) { if (c >= 97 && c <= 122) return c - 32; else return c; } /* * Compare two strings (ASCII case-insensitive). Return value is 1 if * both strings are equal, 0 otherwise. */ static int eqname(unsigned char *b1, unsigned char *b2, size_t len) { while ((len --) > 0) { unsigned c1 = ccanon(*b1 ++), c2 = ccanon(*b2 ++); if (c1 != c2) return 0; } return 1; } /* * Hash a string into an 11-bit value. This function is ASCII * case-insensitive. The hash function is remotely inspired from * the standard ELF hash function. */ static unsigned hash_string(unsigned char *b, size_t len) { unsigned h = 0; while ((len --) > 0) { unsigned g; unsigned c = ccanon(*b ++); h = (h << 5) + (0xFF & ~c); g = h & 0xF800U; h ^= g >> 11; h &= ~g; } return h; } /* * Find a word in a wordlist. Returned value is either the word header * address, or 0 if the word was not found. The "bucket" value should * be computed from the hashed name. * * Note: Malevolent tinkering with the dictionary area could fool this * function into an infinite loop. */ static ucell lookup_word(ucell wordlist, unsigned char *name, size_t name_len, unsigned bucket) { ucell ptr; ptr = fetch(wordlist + CELLS(1 + bucket)); while (ptr != 0) { ucell len = fetch(ptr + WH_NAMELEN); if (len == name_len) { ucell nfa = ptr + WH_NAME; check_range(nfa, len); if (eqname(name, native_address(nfa), len)) return ptr; } ptr = fetch(ptr + WH_LFA); } return 0; } /* * Find a word by name, trying all wordlists in the current search * order. Return value is the word header address, or 0 if the word * is not found. */ static ucell find_word(ucell name, ucell name_len) { ucell wl = FETCH(FA_CURRENT_SEARCH); unsigned char *rname; unsigned bucket; check_range(name, name_len); rname = native_address(name); bucket = hash_string(rname, name_len) % WL_BUCKET_NUM; while (wl != 0) { ucell wa = lookup_word(wl, rname, name_len, bucket); if (wa != 0) return wa; wl = fetch(wl); } return 0; } /* * Link the provided word into a wordlist. */ static void link_word(ucell wordlist, ucell wa) { ucell nfa, len, tmp; unsigned u; len = fetch(wa + WH_NAMELEN); nfa = wa + WH_NAME; check_range(nfa, len); u = hash_string(native_address(nfa), len) % WL_BUCKET_NUM; tmp = wordlist + CELLS(1 + u); store(wa + WH_LFA, fetch(tmp)); store(tmp, wa); #ifdef DEBUG fprintf(stderr, "linking " XCELL ": {", wa); logword(nfa, len); fprintf(stderr, "}\n"); #endif } /* * String to integer conversion. The characters pointed to by "str", * and up to "*len" characters, are processed according to the provided * base: for each character which is valid (relatively to the base), * "orig" is multiplied by "base" and the character value is added to * "orig". * * Processing stops at the end of the string or at the first invalid * character. The function returns the value of "orig" finally reached. * When the function returns, "*len" contains the number of unprocessed * characters. * * "base" must have a value between 2 and 36 inclusive (this is not * verified by this function). ASCII letters A to Z (or a to z -- this * function is not case-sensitive) correspond to values 10 to 35. */ static ucell to_number(ucell orig, unsigned char *str, size_t *len, int base) { while (*len > 0) { int c = *str ++; if (c >= 48 && c <= 57) c -= 48; else if (c >= 65 && c <= 90) c -= 55; else if (c >= 97 && c <= 122) c -= 87; else break; if (c >= base) break; (*len) --; orig = orig * (ucell)base + (ucell)c; } return orig; } /* * "Compile" a cell: a new cell is allocated and the provided value is * stored in that cell. The cell address is returned. */ static ucell compile(cell value) { ucell addr = FETCH(FA_HERE); allot(CELLSIZE); store(addr, value); return addr; } /* ======================================================================= */ /* * I/O code. * * For each open file accessible from the Forth code, we keep a C "FILE *" * pointer. A file identifier is just an index in that table. We allow * for a maximum of 256 simultaneously open files (but the underlying OS * may restrict us further). */ #define OPENFILE_MAX 256 /* * This array represents the open files; free array entries contain NULL. */ static FILE *file[OPENFILE_MAX]; /* * Initialize the file[] array. */ static void file_init(void) { int i; for (i = 0; i < OPENFILE_MAX; i ++) file[i] = NULL; } /* * Open a file. Return value is -1 if the file cannot be opened. * Note: in standard C, we cannot open a file for just writing to * it (_not_ appending) without truncating it. For such a combination, * we open the file for both reading and writing. */ #define FF_RO 0x0001U /* read-only access */ #define FF_WO 0x0002U /* write-only access */ #define FF_RW (FF_RO | FF_WO) /* read-write access */ #define FF_BIN 0x0004U /* binary access (useless in most Unix systems) */ #define FF_CTR 0x0008U /* create if absent, truncate if present */ static int file_open(unsigned char *name, size_t name_len, unsigned flags) { char tmp[FILENAME_MAX]; int i; char am[10] = ""; int mw, mr; /* * We must copy the file name to a temporary buffer because * the C library requires a terminating 0. */ if (name_len > (sizeof tmp) - 1) return -1; memcpy(tmp, name, name_len); tmp[name_len] = 0; /* * We reject attempts at opening file names with embedded 0. */ if (strlen(tmp) != name_len) return -1; /* * We find the first free file[] array entry. */ for (i = 0; i < OPENFILE_MAX; i ++) if (file[i] == NULL) break; if (i == OPENFILE_MAX) return -1; mr = (flags & FF_RO) != 0; mw = (flags & FF_WO) != 0; if (flags & FF_CTR) { if (mr && mw) strcpy(am, "w+"); else if (mw) strcpy(am, "w"); else if (mr) strcpy(am, "r"); else return -1; } else { strcpy(am, mw ? "r+" : "r"); } if (flags & FF_BIN) strcat(am, "b"); file[i] = fopen(tmp, am); if (file[i] == NULL) return -1; else return i; } /* * Close a file. Return value is 0 on success, -1 on error. This * function handles correctly (i.e., by returning an error, not by * crashing) spurious closes on unopened files. */ static int file_close(int i) { int r; if (i < 0 || i >= OPENFILE_MAX) return -1; if (file[i] == NULL) return -1; r = fclose(file[i]); file[i] = NULL; return r == 0 ? 0 : -1; } /* * Read a line. The buffer provided should have length at least "len + 1". * * Return value: * -3 invalid file ID * -2 I/O error * -1 EOF reached, nothing read * n >= 0 line length, not counting the terminator * * If the returned value is equal to "len", then the terminator was not * reached. */ static long file_readline(int f, unsigned char *buf, size_t len) { size_t n; if (f < 0 || f >= OPENFILE_MAX || file[f] == NULL) return -3; if (fgets(buf, len + 1, file[f]) == NULL) return ferror(file[f]) ? -2 : -1; n = strlen(buf); if (n > 0 && buf[n - 1] == '\n') buf[-- n] = 0; return (long)n; } /* ======================================================================= */ /* * Native words. Each word is implemented as a C function which takes * no argument and returns nothing. * * Some of those words could easily have been implemented in pure Forth * just by using the other native words, but were coded as native words * for optimization or aesthetic reasons. */ /* @ */ static void native_fetch(void) { push(fetch(pop())); } /* C@ */ static void native_fetch8(void) { push(fetch8(pop())); } /* ! */ static void native_store(void) { ucell addr = pop(); store(addr, pop()); } /* C! */ static void native_store8(void) { ucell addr = pop(); store8(addr, pop()); } /* >R */ static void native_toR(void) { push_ret(pop()); } /* R> */ static void native_Rfrom(void) { push(pop_ret()); } /* R@ */ static void native_Rfetch(void) { push(look_ret(0)); } /* + */ static void native_plus(void) { cell b = pop(); cell a = pop(); push(a + b); } /* - */ static void native_minus(void) { cell b = pop(); cell a = pop(); push(a - b); } /* * */ static void native_star(void) { ucell b = pop(); ucell a = pop(); push(a * b); } /* /MOD */ static void native_slashmod(void) { cell d = pop(); cell m = pop(); if (d == 0) panic("division by 0"); #if CELLMAX == -(1 + CELLMIN) /* * When two's complement is used with no trap representation, then * a signed division of CELLMIN by -1 yields an integer overflow * (-CELLMIN is strictly greater than CELLMAX). Some processors * (e.g., i386+) react to such an occurrence with an exception, * just like they would do for a division by 0. Hence this test, * which avoids an ugly crash. */ if (m == CELLMIN && d == -1) panic("overflow in division by -1"); #endif push(m % d); push(m / d); } /* U/MOD */ /* * This word is not part of ANS Forth. Stack diagram: * u1 u2 -- u3 u4 * where u4 is the quotient of u1 by u2 and u3 is the remainder. */ static void native_uslashmod(void) { ucell d = pop(); ucell m = pop(); if (d == 0) panic("division by 0"); push(m % d); push(m / d); } /* 0< */ static void native_zeroless(void) { cell v = pop(); push(v < 0 ? (ucell)-1 : (ucell)0); } /* 0= */ static void native_zeroequals(void) { cell v = pop(); push(v == 0 ? (ucell)-1 : (ucell)0); } /* 2/ */ static void native_twoslash(void) { cell v = pop(); /* * On most architectures, right-shifting by 1 bit would work, * because duplicating the sign bit is what they do internally * and also what is most useful. But the C standard does not * mandate it, hence this test and manual bit duplication. */ if (v < 0) { ucell u = v; push((u >> 1) | ((ucell)-1 - ((ucell)-1 >> 1))); } else { push((ucell)v >> 1); } } /* < */ static void native_lessthan(void) { cell v2 = pop(); cell v1 = pop(); push(v1 < v2 ? (ucell)-1 : 0); } /* U< */ static void native_ulessthan(void) { ucell u2 = pop(); ucell u1 = pop(); push(u1 < u2 ? (ucell)-1 : 0); } /* = */ static void native_equals(void) { cell v2 = pop(); cell v1 = pop(); push(v1 == v2 ? (ucell)-1 : 0); } /* ALLOT */ static void native_allot(void) { allot(pop()); } /* AND */ static void native_and(void) { ucell u2 = pop(); ucell u1 = pop(); push(u1 & u2); } /* OR */ static void native_or(void) { ucell u2 = pop(); ucell u1 = pop(); push(u1 | u2); } /* XOR */ static void native_xor(void) { ucell u2 = pop(); ucell u1 = pop(); push(u1 ^ u2); } /* INVERT */ static void native_invert(void) { push(~(ucell)pop()); } /* CELL+ */ static void native_cellplus(void) { push(pop() + CELLSIZE); } /* CELLS */ static void native_cells(void) { push(pop() * CELLSIZE); } /* FILL */ static void native_fill(void) { cell v = (ucell)pop() & 0xFF; ucell cc = pop(); ucell addr = pop(); if (cc == 0) return; check_range(addr, cc); memset(native_address(addr), v, cc); } /* MOVE */ static void native_move(void) { ucell cc = pop(); ucell dst = pop(); ucell src = pop(); if (cc == 0) return; check_range(src, cc); check_range(dst, cc); memmove(native_address(dst), native_address(src), cc); } /* LSHIFT */ static void native_lshift(void) { ucell cc = pop(); push((ucell)pop() << cc); } /* RSHIFT */ static void native_rshift(void) { ucell cc = pop(); push((ucell)pop() >> cc); } /* DROP */ static void native_drop(void) { (void)pop(); } /* PARSE */ static void native_parse(void) { unsigned c = pop() & 0xFF; ucell source = FETCH(FA_SOURCE_PTR); ucell len = FETCH(FA_SOURCE_LEN); ucell ptr = FETCH(FA_TOIN); ucell addr; check_range(source, len); addr = source + ptr; while (ptr < len) { unsigned d = *(unsigned char *)native_address(source + ptr); /* * If the delimiter is 0x20 (SPC), we consider all "control" * characters to be delimiters. This shall match tabulations, * carriage returns and so on. ANS Forth allows this, and * this is the most useful behaviour. */ if (c == 0x20 && d <= 0x20) break; if (c == d) break; ptr ++; } push(addr); push(source + ptr - addr); if (ptr < len) ptr ++; STORE(FA_TOIN, ptr); } /* PARSE-WORD */ /* * PARSE-WORD is not part of the ANS Forth stricto sensu, but it is cited * there, and also common and useful. PARSE-WORD skips leading blanks, * then parses a blank-delimited word and returns it as "c-addr u". * If the word is not found, the returned length is 0 and the returned * address has no meaning (this implementation returns 0 as address). */ static void native_parseword(void) { ucell source = FETCH(FA_SOURCE_PTR); ucell len = FETCH(FA_SOURCE_LEN); ucell ptr = FETCH(FA_TOIN); check_range(source, len); while (ptr < len) { unsigned d = *(unsigned char *)native_address(source + ptr); if (d > 32) break; ptr ++; } if (ptr > len) ptr = len; STORE(FA_TOIN, ptr); if (ptr >= len) { push(0); push(0); } else { push(0x20); native_parse(); } } /* FIND-WORD */ /* * FIND-WORD is equivalent to FIND except that it uses a "c-addr u" * string format. Stack diagram: ( c-addr u -- c-addr u 0 | xt 1 | xt -1 ) * This word is not part of ANS Forth, but it is defined in Gforth. */ static void native_findword(void) { ucell len = pop(); ucell addr = pop(); ucell wa; wa = find_word(addr, len); if (wa != 0) { ucell xti = fetch(wa + WH_CFA_I); ucell xtc = fetch(wa + WH_CFA_C); push(xti); push(xtc == 0 ? -1 : 1); } else { push(addr); push(len); push(0); } } /* FIND-WORD-HEADER */ /* * This word is not part of ANS Forth. Its stack diagram is the following: * c-addr u -- 0 | addr -1 * It returns 0 if the word is not found; otherwise, it returns the word * address header, followed by -1. */ static void native_findheader(void) { ucell len = pop(); ucell addr = pop(); ucell wa; wa = find_word(addr, len); if (wa != 0) { push(wa); push(-1); } else { push(0); } } /* REFILL */ static void native_refill(void) { /* * The source buffer is located at the end of the dictionary * space, just below the two stacks. When we read a line, we * use as much buffer as we need, but we may have to give up * if the line is too long. The buffer size is repreatedly * doubled, until end-of-line is reached, or an error happens. */ int f = FETCH(FA_SOURCE_ID); int cr = 0; ucell lim = FETCH(FA_HERE) + PAD_LEN; size_t len = MIN_LINE_LEN + 1, old_len = 0; /* * If the current source is a string (negative source ID), we * just bail out immediately, as mandated by ANS Forth. */ if (f < 0) { push(0); return; } /* * "len" contains the current buffer (total) length. * "old_len" contains the previous buffer total length. */ for (;;) { long rl; unsigned char *buf; size_t tlen, plen; if (len > RESERVED_BEGIN || lim >= (RESERVED_BEGIN - len)) panic("not enough room for REFILL"); buf = native_address(RESERVED_BEGIN - len); if (old_len > 0) { unsigned char *old_buf; old_buf = native_address(RESERVED_BEGIN - old_len); memmove(buf, old_buf, old_len); plen = old_len - 1; buf += plen; } else { plen = 0; } tlen = len - plen; rl = file_readline(f, buf, tlen - 1); if (rl < -1) panic("I/O error (%ld) while reading input stream", rl); if (rl == -1) rl = 0; else cr = 1; if ((size_t)rl + 1 == tlen) { old_len = len; len = 2 * len; continue; } if (cr) { STORE(FA_SOURCE_PTR, RESERVED_BEGIN - len); STORE(FA_SOURCE_LEN, (size_t)rl + plen); STORE(FA_TOIN, 0); push(-1); } else { push(0); } return; } } /* CREATE */ static void native_create(void) { ucell addr, len, wa, cfa; native_parseword(); len = pop(); if (len == 0) panic("CREATE used with a zero-length name"); addr = pop(); check_range(addr, len); wa = create_header(native_address(addr), len); cfa = compile(OP_BODY); store(wa + WH_CFA_I, cfa); compile(0); link_word(FETCH(FA_CURRENT_LIST), wa); STORE(FA_LAST_WORD, wa); } /* , */ static void native_comma(void) { compile(pop()); } /* : */ static void native_colon(void) { ucell addr, len, wa, xt; native_parseword(); len = pop(); if (len == 0) panic(": used with a zero-length name"); addr = pop(); check_range(addr, len); #ifdef DEBUG fprintf(stderr, "defining new word: {"); logword(addr, len); fprintf(stderr, "}\n"); #endif wa = create_header(native_address(addr), len); xt = FETCH(FA_HERE); STORE(wa + WH_CFA_I, xt); /* * Reference this word xt as the current word (for RECURSE). * Note that this is the xt, not the word header address. */ STORE(FA_CURRENT_XT, xt); /* * Go to compilation state. */ STORE(FA_STATE, -1); /* * Push the word address on the data stack. This is used by ; when * ending the word definition. */ push(wa); #ifdef DEBUG fprintf(stderr, "leaving colon-sys, depth = " UCELL "\n", data_stack_depth()); #endif } /* ;NO-TERMINATOR */ /* * This word is not part of ANS Forth. It acts like a plain ; except that * it does not compile an OP_EXIT opcode into the current definition. * This is used when you define a word with externally supplied * interpretation or compilation semantics. */ static void native_semicolonNT(void) { ucell cs = pop(); if (cs == 0) { push(FETCH(FA_CURRENT_XT)); } else { link_word(FETCH(FA_CURRENT_LIST), cs); STORE(FA_LAST_WORD, cs); } STORE(FA_STATE, 0); } /* ; */ static void native_semicolon(void) { #ifdef DEBUG fprintf(stderr, "end of word definition, depth = " UCELL "\n", data_stack_depth()); #endif compile(OP_EXIT); native_semicolonNT(); } /* ( */ static void native_lpar(void) { for (;;) { ucell addr, len, tin; addr = FETCH(FA_SOURCE_PTR); len = FETCH(FA_SOURCE_LEN); tin = FETCH(FA_TOIN); check_range(addr, len); if (tin < len) { size_t u; unsigned char *buf = native_address(addr); for (u = tin; u < len; u ++) { if (buf[u] == ')') { STORE(FA_TOIN, u + 1); return; } } } native_refill(); if (pop() == 0) return; } } /* ABORT */ /* * FIXME: ABORT must not actually exit the system, but simply flush the * stacks and get back to interpreter mode. */ static void native_abort(void) { panic("ABORT"); } /* EMIT */ static void native_emit(void) { ucell c = pop(); putchar(c & 0xFF); } /* TYPE */ static void native_type(void) { ucell len = pop(); ucell addr = pop(); check_range(addr, len); fwrite(native_address(addr), 1, len, stdout); } /* >NUMBER */ static void native_tonumber(void) { ucell len = pop(); ucell addr = pop(); ucell orig = pop(); cell base = FETCH(FA_BASE); size_t nlen; if (base < 2 || base > 36) panic("invalid base: " DCELL, base); check_range(addr, len); nlen = len; orig = to_number(orig, native_address(addr), &nlen, base); push(orig); push(addr + (len - nlen)); push(nlen); } #if 0 /* obsolete */ /* . */ static void native_dot(void) { printf(" " DCELL, pop()); } /* U. */ static void native_udot(void) { printf(" " UCELL, (ucell)pop()); } #endif /* DEPTH */ static void native_depth(void) { push(data_stack_depth()); } /* J */ static void native_J(void) { push(look_ret(2)); } /* #! */ /* * This word is not part of ANS Forth. It acts like \ as a comment * introducer; it is used for writing Forth source code as a Unix script. */ static void native_sharpbang(void) { STORE(FA_TOIN, FETCH(FA_SOURCE_LEN)); } /* * This table holds pointers to the native word definitions. For each * word, we have the word name (as a C string) and the word interpretation * semantics (a function pointer). If the name is NULL, then the provided * function is actually the (special) compilation semantics for the * previous word. * * The name is only used at startup, for building the native word table. */ static struct { char *name; void (*fun)(void); } nwords[] = { { "@", native_fetch }, { "C@", native_fetch8 }, { "!", native_store }, { "C!", native_store8 }, { ">R", native_toR }, { "R>", native_Rfrom }, { "R@", native_Rfetch }, { "SWAP", swap }, { "DROP", native_drop }, { "DUP", dup }, { "ROT", rot }, { "-ROT", notrot }, { "OVER", over }, { "ROLL", roll }, { "PICK", pick }, { "+", native_plus }, { "-", native_minus }, { "*", native_star }, { "/MOD", native_slashmod }, { "U/MOD", native_uslashmod }, { "0<", native_zeroless }, { "0=", native_zeroequals }, { "2/", native_twoslash }, { "<", native_lessthan }, { "=", native_equals }, { "U<", native_ulessthan }, { "ALIGN", align }, { "ALLOT", native_allot }, { "AND", native_and }, { "OR", native_or }, { "XOR", native_xor }, { "INVERT", native_invert }, { "CELL+", native_cellplus }, { "CELLS", native_cells }, { "FILL", native_fill }, { "MOVE", native_move }, { "LSHIFT", native_lshift }, { "RSHIFT", native_rshift }, { "PARSE", native_parse }, { "PARSE-WORD", native_parseword }, { "FIND-WORD", native_findword }, { "FIND-WORD-HEADER", native_findheader }, { "REFILL", native_refill }, { "CREATE", native_create }, { ":", native_colon }, { ";", native_semicolon }, { 0, native_semicolon }, { ";NO-TERMINATOR", native_semicolonNT }, { 0, native_semicolonNT }, { "(", native_lpar }, { 0, native_lpar }, { ",", native_comma }, { "ABORT", native_abort }, { "EMIT", native_emit }, { "TYPE", native_type }, { ">NUMBER", native_tonumber }, #if 0 /* obsolete */ { ".", native_dot }, { "U.", native_udot }, #endif { "DEPTH", native_depth }, { "J", native_J }, { "#!", native_sharpbang }, }; static void native_exec(ucell cfa) { if (cfa > (sizeof nwords) / (sizeof nwords[0])) panic("illegal native word call: " UCELL, cfa); nwords[cfa].fun(); } /* * Pre-defined constants. Those constants are accessible as Forth words * and are used to communicate some important data to the Forth-implemented * system words (e.g., reserved memory layout). */ static struct { char *name; ucell value; } nconsts[] = { { "BASE", FA_BASE }, { "HERE-PTR", FA_HERE }, { "CURRENT-XT", FA_CURRENT_XT }, { "LAST-WORD", FA_LAST_WORD }, { "SOURCE-PTR", FA_SOURCE_PTR }, { "SOURCE-LEN-PTR", FA_SOURCE_LEN }, { "SOURCE-ID-PTR", FA_SOURCE_ID }, { ">IN", FA_TOIN }, { "STATE", FA_STATE }, { "CURRENT-LIST-PTR", FA_CURRENT_LIST }, { "CURRENT-SEARCH-PTR", FA_CURRENT_SEARCH }, { "DATA-STACK-BASE", DATA_STACK_END }, { "RETURN-STACK-BASE", RETURN_STACK_END }, { "CELLSIZE", CELLSIZE }, { "WORDLIST-BUCKET#", WL_BUCKET_NUM }, { "WH-NAMELEN", WH_NAMELEN }, { "WH-CFA-I", WH_CFA_I }, { "WH-CFA-C", WH_CFA_C }, { "WH-LFA", WH_LFA }, { "WH-NAME", WH_NAME }, { "OPCODE-EXIT", OP_EXIT }, { "OPCODE-JMP", OP_JMP }, { "OPCODE-IF", OP_IF }, { "OPCODE-LIT", OP_LIT }, { "OPCODE-ALIT", OP_ALIT }, { "OPCODE-BODY", OP_BODY }, { "OPCODE-EXECUTE", OP_EXECUTE }, }; /* * Create a constant. We define constants as words whose code consists in * only an OP_LIT opcode followed by OP_EXIT. */ static void create_constant(ucell wl, char *name, ucell value) { ucell wa = create_header(name, strlen(name)); STORE(wa + WH_CFA_I, FETCH(FA_HERE)); compile(OP_LIT); compile(value); compile(OP_EXIT); link_word(wl, wa); /* * Note: we do not register those constants in the * "current word" and "last word" fields. */ } /* * Toplevel interpreter/compiler. */ static void toplevel(void) { #ifdef DEBUG fprintf(stderr, "entering toplevel, depth = " UCELL "\n", data_stack_depth()); #endif for (;;) { ucell wa; ucell addr, len; for (;;) { native_parseword(); if (look(0) != 0) break; /* * No word found: we need to refill the buffer. * If the refill failed, then we are at the end * of that input source, and we exit. */ pop(); pop(); native_refill(); if (!pop()) return; } len = pop(); addr = pop(); /* * Now we look for the word. */ wa = find_word(addr, len); if (wa == 0) { /* * The word was not found. We try to use the * word as a number. */ int fc; fc = fetch8(addr); if (fc == 0x2D) { /* * 0x2D is ASCII '-' */ addr ++; len --; if (len == 0) panic("word '-' is unknown !"); fc = 1; } else { fc = 0; } push(0); push(addr); push(len); native_tonumber(); if (pop() != 0) { /* * Conversion was not complete. This is not * a number. */ fprintf(stderr, "unkown word: "); fwrite(native_address(addr), 1, len, stderr); fprintf(stderr, "\n"); panic("unknown word"); } pop(); if (fc) push(-pop()); if (FETCH(FA_STATE)) { /* * If we are in compilation state, we * compile the number (left on the stack) * into the current definition. */ compile(OP_LIT); native_comma(); } continue; } if (FETCH(FA_STATE)) { ucell xt = fetch(wa + WH_CFA_C); if (xt == 0) compile(fetch(wa + WH_CFA_I)); else interp(xt); } else { interp(fetch(wa + WH_CFA_I)); } } } /* * Main program. Command-line arguments are Forth source file names; those * files are processed successively. When all files have been processed, * the interactive device (stdin) is used. */ int main(int argc, char *argv[]) { int i; unsigned u; ucell wl, wa = 0; /* * Initialize stack pointers. */ data_stack_ptr = (cell *)&dictionary.b[DATA_STACK_END]; return_stack_ptr = (cell *)&dictionary.b[RETURN_STACK_END]; /* * Initialize file access table. */ file_init(); /* * Initialize memory. */ STORE(0, OP_EXIT); STORE(FA_BASE, 10); STORE(FA_HERE, FA_EORA); STORE(FA_CURRENT_XT, 0); STORE(FA_LAST_WORD, 0); STORE(FA_SOURCE_ID, 0); STORE(FA_SOURCE_PTR, RESERVED_BEGIN); STORE(FA_SOURCE_LEN, 0); STORE(FA_TOIN, 0); STORE(FA_STATE, 0); /* * Create the initial word list. */ wl = FETCH(FA_HERE); allot(CELLS(1 + WL_BUCKET_NUM)); STORE(wl, 0); for (u = 0; u < WL_BUCKET_NUM; u ++) STORE(wl + CELLS(1 + u), 0); STORE(FA_CURRENT_LIST, wl); STORE(FA_CURRENT_SEARCH, wl); /* * Create native words. */ for (u = 0; u < (sizeof nwords) / (sizeof nwords[0]); u ++) { if (nwords[u].name == 0) { store(wa + WH_CFA_C, (u << 1) + 1); } else { wa = create_header(nwords[u].name, strlen(nwords[u].name)); store(wa + WH_CFA_I, (u << 1) + 1); link_word(wl, wa); } } /* * Create constants. */ for (u = 0; u < (sizeof nconsts) / (sizeof nconsts[0]); u ++) create_constant(wl, nconsts[u].name, nconsts[u].value); /* * Process input files. */ file[0] = stdin; for (i = 1; i < argc; i ++) { int f = file_open(argv[i], strlen(argv[i]), FF_RO); if (f < 0) panic("file not found: %s\n", argv[i]); STORE(FA_SOURCE_ID, f); toplevel(); file_close(f); } STORE(FA_SOURCE_ID, 0); toplevel(); return 0; }