UNIT dictionary; {-*-Mode: fundamental-mode}

{ This unit can create, clear and dispose dictionaries. They can be
  filled with symbols or any strings which are assigned to numerical
  keys. Applications simply deal with short keys (ShortWord) as place
  holders for symbols/strings. Several dictionaries can be opened and
  used in parallel.

  Access to dictionaries is done via handles. Transforming a string
  to it's key goes via hash table and is very fast. The oposite way
  (key to string) currently is sequential and hence slow but could be
  drastically speeded up without interface modification at the expense
  of 2 more bytes per symbol/string.
  
  Symbol creation and error signalling can be controlled with the mode
  parameter which tells assumptions at calling time: the symbol/string
  must be new (create) or must already exist (exist) or don't care
  (unknown). Function return value of true in this sense means everything
  is ok. So we need very few routines to cover several aspects of usage.
  The current version accepts but ignores the case_sens parameter (case
  sensitivity is active).

  Strings are stored with good memory utilization. Drawback: There is no
  delete function ;-(. In all of my applications there was no need for
  that function. Realizing a delete function would not be simple and needed
  some general declaration what we really want (do we need a use count for
  symbols? etc.).

  Author: Ernst-Ludwig Bohnen }

INTERFACE
const
  dic_max_sym  = 255;                               {maximum symbol or string length}

type
  dic_handle_t = pointer;                           {dictionary handle}
  dic_sym      = string (dic_max_sym);              {symbol string}
  dic_mode     = (create, exist, unknown);          {what we expect in dictionary}
  dic_indx     = ShortWord;                         {pos number assigned to a symbol}

  {            Actions and boolean function return values () of dic_sym2indx
  -----------I------------------------------------------------I-----------------------------
  mode       I symbol/string doesn't exist                    I symbol/string exists already
  -----------I------------------------------------------------I-----------------------------
  create     I create symbol, return new index      (true)    I return its index  (false)
  exist      I do not create symbol, return index=0 (false)   I return its index  (true)
  unknown    I create symbol, return new index      (true)    I return its index  (true)
  -----------I------------------------------------------------I-----------------------------}

procedure dic_new    (var handle : dic_handle_t; case_sens: boolean);    {create dictionary from scratch}
procedure dic_reset      (handle : dic_handle_t);                        {remove all entries, keep alive}
procedure dic_dispose    (handle : dic_handle_t);                        {deallocate all memory}
function  dic_sym2indx   (handle : dic_handle_t; sym  : dic_sym;  var index : dic_indx; mode: dic_mode): boolean;
function  dic_indx2sym   (handle : dic_handle_t; index: dic_indx; var symbol: dic_sym)                 : boolean;
function  dic_symbol     (handle : dic_handle_t; index: dic_indx): dic_sym;{simple version of dic_indx2sym}
function  dic_last_indx  (handle : dic_handle_t): dic_indx;              {number of stored symbols/strings}

procedure dic_read       (handle : dic_handle_t; file_name: string);     {Load existing dictionary from disk}
procedure dic_write      (handle : dic_handle_t; file_name: string);     {Store dictionary to disk}
procedure dic_diagnostic (handle : dic_handle_t);                        {print hash efficiency statistics}


{----------------------------------------------------------------------------------------}
IMPLEMENTATION
uses base;

const
  dic_max_data = 2000;                              {data chunk size}
  dic_hash_lob = 7;                                 {number of significant low order bits of hash table index}
{ dic_hash_max = 2 pow dic_hash_lob -1 ;}           {used also as mask to index} 
  dic_hash_max = 127;                               {compiler version 19991030 spinnt ?!}
                                                    {set dic_hash_max manually to 2 ** dic_hash_lob -1}
  dic_version  = 'dic_ver1';                        {read files only if written with the same dic_version}

type
  dic_data_p   = ^dic_data;                         {data block}
  dic_data     = record
                   nxtblk   : dic_data_p;           {pointer to next data block, simple list}
                   blk      : string (dic_max_data);{takes many smaller strings/symbols whithout holes}
                 end;

  dic_w_p      = ^dic_w;                            {symbol control block}
  dic_w        = record 
                   nxtw     : dic_w_p;              {link to next symbol}
                   hh       : ShortWord;            {High order part of hash code, speeds up symbol search}
                   len      : ShortWord;            {char length of symbol}
                   indx     : dic_indx;             {Index: 1.. 2 ** 16 -1 }
                   offs     : ShortWord;            {first ch of symbol in blk[]}
                   blk_ptr  : dic_data_p;           {points to the block where symbol is stored}
                 end;

  dic_handle   = ^dic_record;
  dic_record   = record
                   root_blk : dic_data_p;           {pointer to data chain}
                   last_indx: dic_indx;             {last given index, or number of stored symbols}
                   cs       : boolean;              {true if case sensive}
                   hash_tab : array [0 .. dic_hash_max] of dic_w_p
                 end;

procedure dic_new    (var handle : dic_handle_t; case_sens: boolean);
var i: integer; p: dic_handle;
BEGIN 
  new (p); handle:=p;                               {return new handle to caller}
  with p^ do
  begin
    root_blk  := nil;                               {start without any data blocks and sym control blocks}
    last_indx := 0;                                 {init index}
    cs        := case_sens;                         {remember case sensivity}
    for i:= 0 to dic_hash_max do hash_tab [i]:= nil
  end
END;

procedure dic_reset      (handle : dic_handle_t);
var
  pp     : dic_handle;                              {make the object 'empty'}
  d      : dic_data_p;                              {but do not change the handle}
  p, p1  : dic_w_p;                                 {application may have spred out}
  i      : Integer;                                 {several copies of the handle}
begin
pp:= handle;
with pp^ do
  begin
    while  root_blk <>nil do                        {remove all data blocks}
      begin
        d:= root_blk;
        root_blk:= d^.nxtblk;
        dispose (d)
      end;
    for i:= 0 to dic_hash_max do                    {remove all symbol blocks}
      begin
        p:= hash_tab [i];
        while p<>nil do begin p1:= p; p:= p^.nxtw; dispose (p1) end;
        hash_tab [i] := nil
      end;
    last_indx := 0
  end
end;

procedure hash_code (s: dic_sym; var hoh, loh: ShortInt);  {internal use only}
var
  w : Word;
  i : Integer;
begin
  w:= length (s);
  for i:= 1 to length (s) do w:= (w shl 1) xor ord (s[i]) xor ((w shr 6) and 4); {try CRC code}
  loh:= w and dic_hash_max;                         {for low order part mask out the high order bits}
  hoh:= w shr dic_hash_lob                          {for high order part shift out low order bits}
end;

procedure dic_dispose     (handle : dic_handle_t);
var
  pp : dic_handle;
begin
  pp:= handle;
{ dic_diagnostic (pp); } {out comment this line and recompile to see hash efficiency of hash code}
  dic_reset (pp);
  dispose   (pp)
end;

function  dic_sym2indx   (handle : dic_handle_t; sym : dic_sym;  var index: dic_indx; mode: dic_mode): boolean;
var
  pp        : dic_handle;
  hoh, loh  : ShortInt;
  p         : dic_w_p;
  d         : dic_data_p;
begin
  pp           := handle;
  dic_sym2indx := true;
  hash_code (sym, hoh, loh);
  with pp^ do
    begin
      p:= hash_tab [loh];
      while p<>nil do with p^ do
        begin
          if (hoh = hh) then if (length (sym) = len) then if sym = substr (blk_ptr^.blk, offs, length (sym)) then
             begin
               index:= indx;
               break
             end;
          p:= nxtw
        end;
      if p<>nil
         then if mode=create then dic_sym2indx:= false else     {Object exists}
         else if mode=exist  then dic_sym2indx:= false else     {Object not found}
           begin                                                {if mode= create or unknown create Object}
             d:= root_blk;
             while d<>nil do with d^ do                         {search a block of sufficient size}
               begin
                 if length (sym) <= (dic_max_data - length (blk)) then break;
                 d:= nxtblk
               end;
             if d=nil then                                      {there is no block with sufficient space}
               begin                                            {get new data block, link}
                 if length (sym) > dic_max_data then abort      ('Exotic symbol length');
                 new (d);
                 d^.nxtblk:= root_blk;
                 d^.blk:= ''; root_blk:= d                      {link to top of chain}
               end;  
             new (p);
             with p^ do
               begin                                            {fill the stuff into p^}
                 nxtw      := hash_tab [loh]; hash_tab[loh]:= p;{get new sym block, link}
                 hh        := hoh;                              {high order part of hash code}
                 len       := length (sym);                     {length of symbol}
                 if last_indx = 16#FFFF then halt (99);         {max index is 2 pow 16 -1, can't increment }
                 last_indx := last_indx+1;                      {should be checked for > 2**16}
                 index     := last_indx;                        {answer to calling routine}
                 indx      := last_indx; 
                 offs      := length (d^.blk) +1 ;              {not clear +1 ???}
                 blk_ptr   := d;
                 d^.blk    := d^.blk + sym;                     {append new symbol at end}
               end;
           end
    end
end;

function  dic_symbol     (handle : dic_handle_t; index: dic_indx): dic_sym;
var
  pp: dic_handle;
  p : dic_w_p;
  i : Integer;
begin
 pp := handle;
 with pp^ do
   begin
     for i:= 0 to dic_hash_max do                   {go through all hash entries}
       begin
         p:= hash_tab [i];                          {get head of linked list, may be nil}
         while p<>nil do with p^ do                 {and all linked sym control blocks}
           begin
             if indx=index then break;              {if index is found end the loops}
             p:= nxtw                               {step to next block}
           end;
         if p<>nil then break                       {if index is found p points to sym control block}
       end;
     if p<>nil then with p^ do dic_symbol:= substr (blk_ptr^.blk, offs, len) else dic_symbol:='UNKNOWN'
   end
end;

function  dic_indx2sym     (handle : dic_handle_t; index: dic_indx; var symbol: dic_sym): boolean;
var
  pp: dic_handle;
begin
 pp := handle;
 with pp^ do if (index <= last_indx) and (index > 0)
   then
     begin
       symbol:= dic_symbol (handle, index);
       dic_indx2sym:= true
     end
   else dic_indx2sym:= false
end;

function  dic_last_indx  (handle : dic_handle_t): dic_indx;
var p: dic_handle;
begin p:= handle; dic_last_indx:= p^.last_indx end;

procedure dic_read       (handle : dic_handle_t; file_name: string);
var
  p    : dic_handle;                                {format of dictionary files:}
  f    : file of char;                              {version control string followed by LF}
  ch   : char;                                      {byte n    : high order part of length L}
  L, i : integer;                                   {byte n+1  : low order part of length L}
  s    : dic_sym;                                   {byte n+2  : first byte of symbol}
  index: dic_indx;                                  {byte ..   :     }
  ok   : boolean;                                   {byte n+1+L: Lth (last) byte of symbol}
begin                                               {next high-low-string sequence}
p := handle;
with p^ do
  begin
    assign (f, file_name);
    reset (f);
    ch:= chr (0); s:= '';
    while (not eof (f)) and (length (s) <= dic_max_sym) do
      begin
        read (f, ch);                               {read version control word}
        if ch<>chr (10) then s:= s + ch else break  {LF delimits version control}
      end;
    if (ch<>chr (10)) or (s <> dic_version) then abort ('Dictionary Version mismatch');
    while not eof (f) do                            {wrong data format causes runtime error:}
      begin                                         {attempt to read past end of file .. (error #454)}
        read (f, ch); L := ord (ch) shl 8;          {first get the high order part of length}
        read (f, ch); L := L or ord (ch);           {then merge the low order part}
        s:= '';
        for i:= 1 to L do                           {just read byte count L must tell the truth}
             begin read (f, ch); s:= s + ch end;    {read L bytes into s, no check of eof (f) !!!}
        ok:= dic_sym2indx(handle, s, index, unknown){and add to dictionary}
      end;
    close (f)
  end
end;

procedure dic_write      (handle : dic_handle_t; file_name: string);
 var
  p    : dic_handle;
  f    : file of char;
  i, j : Integer;
  s    : dic_sym;
begin
p := handle;
with p^ do
  begin
    assign (f, file_name);
    rewrite (f);
    for j:= 1 to length (dic_version) do write (f, dic_version [j]);
    write (f, chr (10));                            {version control string terminated with LF}
    for i:= 1 to last_indx do
      begin
        s:= dic_symbol (handle, i);
        write (f, chr (length (s) shr 8),           {put first the high order part of length}
                  chr (length (s) and 16#FF));      {and then the low order part}
        for j:= 1 to length (s) do write (f, s[j])  {and now the body of symbol}
      end;
    close (f)
  end
end;

procedure dic_diagnostic  (handle : dic_handle_t);
const
  ppl   =  8;          {pairs per line}
  occ   = 10;          {upper array bound (occurances)}
  fw    =  5;          {field width}
var
  pp    : dic_handle;
  i, n  : integer;
  p     : dic_w_p;
  a     : array [0..occ] of integer;

begin
  pp := handle;
  with pp^ do
  begin
    writeln ('Dictionary Statistics');
    writeln ('Number of stored Symbols =', last_indx     :7);
    writeln ('Memory chunk size (byte) =', dic_max_data  :7);
    writeln ('Hash table size          =', dic_hash_max+1:7);
    writeln;
    writeln ('Hash Table index vs number of linked entries (', ppl, ' pairs per line):');
    for i:= 0 to occ do a[i]:= 0;
    for i:= 0 to dic_hash_max do
      begin
        p:= hash_tab [i]; n:= 0;
        while p<>nil do begin n:=n+1; p:= p^.nxtw end;     {look for number of linked entries}
        if (i<>0) and ((i mod ppl) = 0) then writeln;      {show 8 hash table entry pairs per line}
        write (i:5, n:4);                                  {pair of hash index vs number of linked symbols}
        n:= min (n, occ); a [n]:= a[n] + 1                 {prepair some other view of statistics}
      end;
    writeln; writeln;
    writeln   ('Entry length of');                         {write now some other view of statistics}
    for i:= 0 to occ do write (   i:fw); writeln (' or more occured');
    for i:= 0 to occ do write (a[i]:fw); writeln (' times');
   end;
end;

END.  {dictionary implementation}
