The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
/****************************************************************************
*****************************************************************************
FILE      : soundex.c
FUNCTION  : This module contains the two algorithms SOUNDEX and PHONIX as 
            they were defined by Gadd.
NOTES     : This is an ANSI-C version of the original C++ file.
LITERATURE: T.N. Gadd: 'Fishing fore Werds': Phonetic Retrieval of written 
            text in Information Retrieval Systems, Program 22/3, 1988, 
            p.222-237.
            T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990, 
            p.363-366.
*****************************************************************************
****************************************************************************/

/* #define TEST */       /* activates procedures main() and PrintCode() */
/* #define PHONIX_DEBUG */      /* activates some debug information            */

/****************************************************************************
NAME    : StrDel
INPUT   : char *DelPos --- pointer to first char to be deleted
          int  DelSize --- number of chars to be deleted
OUTPUT  : char *DelPos 
FUNCTION: This procedure deletes DelSize chars at position DelPos and moves
          the remaining chars left to DelPos.
EXAMPLE : If Del is pointing at the L of the string "DELETE" the call
          StrDel(Del, 2) will return Del pointing at "TE". 
****************************************************************************/

#ifdef __cplusplus
extern "C" {
#endif
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#ifdef __cplusplus
}
#endif

void StrDel (DelPos, DelSize)
char *DelPos;
int  DelSize;
{
  /* move chars left */
  char *Help = DelPos + DelSize;
  while (*Help)
    *DelPos++ = *Help++;

  /* move trailing \0 */
  *DelPos = *Help;
}


/****************************************************************************
NAME    : StrIns
INPUT   : char *InsPos --- pointer to insert position
OUTPUT  : char *InStr  --- new string to be inserted
FUNCTION: StrIns moves the chars at position InsPos right and copies the
          string InsStr into this free space.
EXAMPLE : If Ins is pointing at the S of the string "INSERT" the call
          StrIns(Ins, "NEW") will return Ins pointing at "NEWSERT". 
****************************************************************************/
void StrIns (InsPos, InsStr)
char *InsPos;
char *InsStr;
{
  int i;
  int MoveSize = strlen(InsStr);

  /* move chars right */
  for (i = strlen(InsPos)+1; i >= 0; i--)
    InsPos[i+MoveSize] = InsPos[i];

  /* copy InsStr to InsPos */
  while (*InsStr)
    *InsPos++ = *InsStr++;
}

extern bool IsVowel();
#if 0
/****************************************************************************
NAME    : IsVowel
INPUT   : char c --- char to be examined
OUTPUT  : int    --- 1 or 0
FUNCTION: IsVowel checks if c is an uppercase vowel or an uppercase Y. If c
          is one of those chars IsVowel will return a 1, else it will return
          a 0.
****************************************************************************/
int IsVowel (c)
unsigned char c;
{
  return (c == 'A') || (c == 'E') || (c == 'I') ||
    (c == 'O') || (c == 'U') || (c == 'Y') ||
      (c == 0304) || (c == 0344) || (c == 0334) ||
        (c == 0366) || (c == 0326) || (c == 0374);
}


/****************************************************************************
NAME    : SoundexCode
INPUT   : char *Name --- string to be calculated
OUTPUT  : char *Key  --- soundex code for Name
FUNCTION: This procedure calculates a four-letter soundex code for the string 
          Name and returns this code in the string Key.
****************************************************************************/
#define SoundexLen 4      /* length of a soundex code */
#define SoundexKey "Z000" /* default key for soundex code */

char SCode(c)
unsigned char c;
{
  /* set letter values */
  static int  Code[] = {0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 
                          1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2};

  fprintf(stderr, "SCode(%c)\n", c);
  if (c == 0337) return(2); /* german sz */
  return(Code[toupper(c)-'A']);
}

void SoundexCode (Name, Key)
unsigned char *Name;
unsigned char *Key;
{
  unsigned char LastLetter;
  int  Index;

  fprintf(stderr, "SoundexCode(%s)\n", Name);
  /* set default key */
  strcpy(Key, SoundexKey);
  
  /* keep first letter */
  Key[0] = *Name;
  LastLetter = *Name;
  Name++;

  /* scan rest of string */
  for (Index = 1; (Index < SoundexLen) && *Name; Name++)
  {
    /* use only letters */
    if (isalpha(*Name))
    {
      /* ignore duplicate successive chars */
      if (LastLetter != *Name)
      {
        /* new LastLetter */
        LastLetter = *Name;

        /* ignore letters with code 0 */
        if (!IsVowel(*Name) && (SCode(*Name) != 0))
        {
          Key[Index] = '0' + SCode(*Name);
          Index++;
        }
      }
    }
  }
}
#endif /* 0 */

/****************************************************************************
NAME    : PhonixCode
INPUT   : char *Name --- string to be calculated
OUTPUT  : char *Key  --- phonix code for Name
FUNCTION: This procedure calculates a eight-letter phonix code for the string 
          Name and returns this code in the string Key.
****************************************************************************/
#define PhonixLen 8          /* length of a phonix code */
#define PhonixKey "Z0000000" /* default key for phonix code */

extern bool IsAlpha();
extern char PCode();

void PhonixCode (Name, Key)
char *Name;
char *Key;
{
  char LastLetter;
  int  Index;

  /* set default key */
  strcpy(Key, PhonixKey);

  /* keep first letter or replace it with '$' */
  Key[0] = IsVowel(*Name) ? '$' : *Name;
  LastLetter = *Name;
  Name++;

  /* NOTE: Gadd replaces vowels being the first letter of the  */
  /* word with a 'v'. Due to the implementation of WAIS all    */
  /* letters will be lowercased. Therefore '$' is used instead */
  /* of 'v'.                                                   */          

  /* scan rest of string */
  for (Index = 1; (Index < PhonixLen) && *Name; Name++)
  {
    /* use only letters */
    if (IsAlpha(*Name))
    {
      /* ignore duplicate successive chars */
      if (LastLetter != *Name)
      {
        LastLetter = *Name;

        /* ignore letters with code 0 except as separators */
        if (PCode(*Name) != '0')
        {
          Key[Index] = PCode(*Name);
          Index++;
        }
      }
    }
  }
}


/****************************************************************************
NAME    : PhonixReplace1
INPUT   : int  where    --- replace OldStr only if it occurs at this position
          char *Name    --- string to work
          char *OldStr  --- old letter group to delete
          char *NewStr  --- new letter group to insert
          int  CondPre  --- condition referring to letter before OldStr
          int  CondPost --- condition referring to letter after OldStr
OUTPUT  : char *Name    --- Name with replaced letter group
FUNCTION: This procedure replaces the letter group OldStr with the letter 
          group NewStr in the string Name, regarding the position of OldStr
          where (START, MIDDLE, END, ALL) and the conditions CondPre and 
          CondPost (NON, VOC, CON).
EXAMPLE : PhonixReplace1(START, "WAWA", "W", "V", NON, NON) replaces only the
          first W with a V because of the condition START.
EXAMPLE : PhonixReplace1(START, "WAWA", "W", "V", NON, CON) replaces neither
          the first W with a V (because of the condition CON, i.e. a consonant
          must follow the W) nor the second W (because of the condition START).
****************************************************************************/
#define NON    1  /* no condition     */
#define VOC    2  /* vowel needed     */ 
#define CON    3  /* consonant needed */

#define START  1  /* condition refers to beginning of Name */
#define MIDDLE 2  /* condition refers to middle of Name    */
#define END    3  /* condition refers to EndPos of Name    */
#define ALL    4  /* condition refers to whole Name        */

void PhonixReplace1 (Where, Name, OldStr, NewStr, CondPre, CondPost)
int  Where;
char *Name;
char *OldStr;
char *NewStr;
int  CondPre;
int  CondPost;
{
  char *OldStrPos;
  char *EndPos;
  char *NamePtr = Name;

  /* vowels before or after OldStr */
  char LetterPre;  /* letter before OldStr */
  char LetterPost; /* letter after OldStr  */
  int  VowelPre;   /* LetterPre is vowel?  */
  int  VowelPost;  /* LetterPost is vowel? */
  int  OkayPre;    /* pre-condition okay?  */
  int  OkayPost;   /* post-condition okay? */

  do
  { 
    /* find OldStr in NamePtr */
    OldStrPos = strstr(NamePtr, OldStr);

    /* find EndPos of Name */
    EndPos = &Name[strlen(Name)-strlen(OldStr)];

    /* check conditions if OldStrPos != NULL */
    if (OldStrPos)
    {
      /* vowel before OldStrPos */
      LetterPre = *(OldStrPos-1);
      /* vowel after OldStrPos+strlen(OldStr) */
      LetterPost = *(OldStrPos+strlen(OldStr));

      /* check conditions */
      switch (CondPre)
      {
        case NON: OkayPre = 1;
                  break;
        case VOC: OkayPre = LetterPre ? IsVowel(LetterPre) : 0;
                  break;
        case CON: OkayPre = LetterPre ? !IsVowel(LetterPre) : 0;
                  break;
        default : OkayPre = 0;
                  break;
      }
      switch (CondPost)
      {
        case NON: OkayPost = 1;
                  break;
        case VOC: OkayPost = LetterPost ? IsVowel(LetterPost) : 0;
                  break;
        case CON: OkayPost = LetterPost ? !IsVowel(LetterPost) : 0;
                  break;
        default : OkayPost = 0;
                  break;
      }
    }

    /* replace OldStr with NewStr */
    if (OldStrPos && OkayPre && OkayPost &&
       ((Where == START)  && (OldStrPos == Name) ||
        (Where == MIDDLE) && (OldStrPos != Name) && (OldStrPos != EndPos) ||
        (Where == END)    && (OldStrPos == EndPos)  ||
        (Where == ALL)))
    {
      /* replace old letter group with new letter group */
      StrDel(OldStrPos, strlen(OldStr));
      StrIns(OldStrPos, NewStr);

      /* advance NamePtr to the position of OldStr */
      NamePtr = OldStrPos;

#ifdef PHONIX_DEBUG
      printf("Replace = %s-->%s\n", OldStr, NewStr);  
#endif /* PHONIX_DEBUG */
    }
    else
      /* advance NamePtr one char */
      NamePtr++;
  }
  while (OldStrPos);
}


/****************************************************************************
NAME    : PhonixReplace2
INPUT   : int  where   --- replace OldStr only if it occurs at this position
          char *Name   --- string to work
          char *OldStr --- old letter group to delete
          char *NewStr --- new letter group to insert
OUTPUT  : char *Name   --- Name with replaced letter group
FUNCTION: This procedure replaces the letter group OldStr with the letter 
          group NewStr in the string Name, regarding the position of OldStr
          where (START, MIDDLE, END, ALL).
EXAMPLE : PhonixReplace2(START, "WAWA", "W", "V") replaces only the first W 
          with a V because of the condition START.
****************************************************************************/
void PhonixReplace2 (Where, Name, OldStr, NewStr)
int  Where;
char *Name;
char *OldStr;
char *NewStr;
{
  char *OldStrPos;
  char *EndPos;
  char *NamePtr = Name;

  do
  { 
    /* find OldStr in NamePtr */
    OldStrPos = strstr(NamePtr, OldStr);

    /* find EndPos of Name */
    EndPos = &Name[strlen(Name)-strlen(OldStr)];

    /* replace OldStr with NewStr */
    if (OldStrPos &&
       ((Where == START)  && (OldStrPos == Name) ||
        (Where == MIDDLE) && (OldStrPos != Name) && (OldStrPos != EndPos) ||
        (Where == END)    && (OldStrPos == EndPos)  ||
        (Where == ALL)))
    { 
      /* replace old letter group with new letter group */
      StrDel(OldStrPos, strlen(OldStr));
      StrIns(OldStrPos, NewStr);

      /* advance NamePtr to the position of OldStr */
      NamePtr = OldStrPos;

#ifdef PHONIX_DEBUG
      printf("Replace = %s-->%s\n", OldStr, NewStr);  
#endif /* PHONIX_DEBUG */
    }
    else
      /* advance NamePtr one char */
      NamePtr++;
  }
  while (OldStrPos);
}


/****************************************************************************
NAME    : Phonix
INPUT   : char *Name --- string to calculate phonix code for
OUTPUT  : char *Key  --- phonix code of Name
FUNCTION: Phonix calculates the phonix code for the string Name.
****************************************************************************/
void Phonix (Name, Key)
char *Name;
char *Key;
{
  /* use new variable NewName to remain Name unchanged */
  char NewName[50];
  int  i;

  strcpy(NewName, Name);

  /* uppercase NewName */
  for (i=0; i < strlen(NewName); i++)
    if (islower(NewName[i])) 
      NewName[i] = toupper(NewName[i]);

#ifdef PHONIX_DEBUG
  printf("Name    = %s\n", NewName);  
#endif /* PHONIX_DEBUG */

  /* replace letter groups according to Gadd's definition */
  PhonixReplace2(ALL   , NewName, "DG"   , "G"    );
  PhonixReplace2(ALL   , NewName, "CO"   , "KO"   );
  PhonixReplace2(ALL   , NewName, "CA"   , "KA"   );
  PhonixReplace2(ALL   , NewName, "CU"   , "KU"   );
  PhonixReplace2(ALL   , NewName, "CY"   , "SI"   );
  PhonixReplace2(ALL   , NewName, "CI"   , "SI"   );
  PhonixReplace2(ALL   , NewName, "CE"   , "SE"   );
  PhonixReplace1(START , NewName, "CL"   , "KL"   , NON, VOC);
  PhonixReplace2(ALL   , NewName, "CK"   , "K"    );
  PhonixReplace2(END   , NewName, "GC"   , "K"    );
  PhonixReplace2(END   , NewName, "JC"   , "K"    );
  PhonixReplace1(START , NewName, "CHR"  , "KR"   , NON, VOC);
  PhonixReplace1(START , NewName, "CR"   , "KR"   , NON, VOC);
  PhonixReplace2(START , NewName, "WR"   , "R"    );
  PhonixReplace2(ALL   , NewName, "NC"   , "NK"   );
  PhonixReplace2(ALL   , NewName, "CT"   , "KT"   );
  PhonixReplace2(ALL   , NewName, "PH"   , "F"    );
  PhonixReplace2(ALL   , NewName, "AA"   , "AR"   );
  PhonixReplace2(ALL   , NewName, "SCH"  , "SH"   );
  PhonixReplace2(ALL   , NewName, "BTL"  , "TL"   );
  PhonixReplace2(ALL   , NewName, "GHT"  , "T"    );
  PhonixReplace2(ALL   , NewName, "AUGH" , "ARF"  );
  PhonixReplace1(MIDDLE, NewName, "LJ"   , "LD"   , VOC, VOC);
  PhonixReplace2(ALL   , NewName, "LOUGH", "LOW"  );
  PhonixReplace2(START , NewName, "Q"    , "KW"   );
  PhonixReplace2(START , NewName, "KN"   , "N"    );
  PhonixReplace2(END   , NewName, "GN"   , "N"    );
  PhonixReplace2(ALL   , NewName, "GHN"  , "N"    );
  PhonixReplace2(END   , NewName, "GNE"  , "N"    );
  PhonixReplace2(ALL   , NewName, "GHNE" , "NE"   );
  PhonixReplace2(END   , NewName, "GNES" , "NS"   );
  PhonixReplace2(START , NewName, "GN"   , "N"    );
  PhonixReplace1(MIDDLE, NewName, "GN"   , "N"    , NON, CON);
  PhonixReplace1(END   , NewName, "GN"   , "N"    , NON, NON); /* NON,CON */
  PhonixReplace2(START , NewName, "PS"   , "S"    );
  PhonixReplace2(START , NewName, "PT"   , "T"    );
  PhonixReplace2(START , NewName, "CZ"   , "C"    );
  PhonixReplace1(MIDDLE, NewName, "WZ"   , "Z"    , VOC, NON);
  PhonixReplace2(MIDDLE, NewName, "CZ"   , "CH"   );
  PhonixReplace2(ALL   , NewName, "LZ"   , "LSH"  );
  PhonixReplace2(ALL   , NewName, "RZ"   , "RSH"  );
  PhonixReplace1(MIDDLE, NewName, "Z"    , "S"    , NON, VOC);
  PhonixReplace2(ALL   , NewName, "ZZ"   , "TS"   );
  PhonixReplace1(MIDDLE, NewName, "Z"    , "TS"   , CON, NON);
  PhonixReplace2(ALL   , NewName, "HROUG", "REW"  );
  PhonixReplace2(ALL   , NewName, "OUGH" , "OF"   );
  PhonixReplace1(MIDDLE, NewName, "Q"    , "KW"   , VOC, VOC);
  PhonixReplace1(MIDDLE, NewName, "J"    , "Y"    , VOC, VOC);
  PhonixReplace1(START , NewName, "YJ"   , "Y"    , NON, VOC);
  PhonixReplace2(START , NewName, "GH"   , "G"    );
  PhonixReplace1(END   , NewName, "E"    , "GH"   , VOC, NON);
  PhonixReplace2(START , NewName, "CY"   , "S"    );
  PhonixReplace2(ALL   , NewName, "NX"   , "NKS"  );
  PhonixReplace2(START , NewName, "PF"   , "F"    );
  PhonixReplace2(END   , NewName, "DT"   , "T"    );
  PhonixReplace2(END   , NewName, "TL"   , "TIL"  );
  PhonixReplace2(END   , NewName, "DL"   , "DIL"  );
  PhonixReplace2(ALL   , NewName, "YTH"  , "ITH"  );
  PhonixReplace1(START , NewName, "TJ"   , "CH"   , NON, VOC);
  PhonixReplace1(START , NewName, "TSJ"  , "CH"   , NON, VOC);
  PhonixReplace1(START , NewName, "TS"   , "T"    , NON, VOC);
  PhonixReplace1(ALL   , NewName, "TCH"  , "CH"   );
  PhonixReplace1(MIDDLE, NewName, "WSK"  , "VSKIE", VOC, NON);
  PhonixReplace1(END   , NewName, "WSK"  , "VSKIE", VOC, NON);
  PhonixReplace1(START , NewName, "MN"   , "N"    , NON, VOC);
  PhonixReplace1(START , NewName, "PN"   , "N"    , NON, VOC);
  PhonixReplace1(MIDDLE, NewName, "STL"  , "SL"   , VOC, NON);
  PhonixReplace1(END   , NewName, "STL"  , "SL"   , VOC, NON);
  PhonixReplace2(END   , NewName, "TNT"  , "ENT"  );
  PhonixReplace2(END   , NewName, "EAUX" , "OH"   );
  PhonixReplace2(ALL   , NewName, "EXCI" , "ECS"  );
  PhonixReplace2(ALL   , NewName, "X"    , "ECS"  );
  PhonixReplace2(END   , NewName, "NED"  , "ND"   );
  PhonixReplace2(ALL   , NewName, "JR"   , "DR"   );
  PhonixReplace2(END   , NewName, "EE"   , "EA"   );
  PhonixReplace2(ALL   , NewName, "ZS"   , "S"    );
  PhonixReplace1(MIDDLE, NewName, "R"    , "AH"   , VOC, CON);
  PhonixReplace1(END   , NewName, "R"    , "AH"   , VOC, NON); /* VOC,CON */
  PhonixReplace1(MIDDLE, NewName, "HR"   , "AH"   , VOC, CON);
  PhonixReplace1(END   , NewName, "HR"   , "AH"   , VOC, NON); /* VOC,CON */
  PhonixReplace1(END   , NewName, "HR"   , "AH"   , VOC, NON);
  PhonixReplace2(END   , NewName, "RE"   , "AR"   );
  PhonixReplace1(END   , NewName, "R"    , "AH"   , VOC, NON);
  PhonixReplace2(ALL   , NewName, "LLE"  , "LE"   );
  PhonixReplace1(END   , NewName, "LE"   , "ILE"  , CON, NON);
  PhonixReplace1(END   , NewName, "LES"  , "ILES" , CON, NON);
  PhonixReplace2(END   , NewName, "E"    , ""     );
  PhonixReplace2(END   , NewName, "ES"   , "S"    );
  PhonixReplace1(END   , NewName, "SS"  , "AS"    , VOC, NON);
  PhonixReplace1(END   , NewName, "MB"  , "M"     , VOC, NON);
  PhonixReplace2(ALL   , NewName, "MPTS" , "MPS"  );
  PhonixReplace2(ALL   , NewName, "MPS"  , "MS"   );
  PhonixReplace2(ALL   , NewName, "MPT"  , "MT"   );

  /* calculate Key for NewName */
  PhonixCode(NewName, Key);

#ifdef PHONIX_DEBUG
  printf("NewName = %s\n", NewName);  
  printf("Code    = %s\n\n", Key);  
#endif /* PHONIX_DEBUG */
}


/****************************************************************************
NAME    : Soundex
INPUT   : char *Name --- string to calculate soundex code for
OUTPUT  : char *Key  --- soundex code of Name
FUNCTION: Soundex calculates the soundex code for the string Name.
****************************************************************************/

void Soundex (Name, Key)
char *Name;
char *Key;
{
  /* use new variable NewName to remain Name unchanged */
  char NewName[50];
  int  i; 

  strcpy(NewName, Name);

  /* uppercase NewName */
  for (i=0; i < strlen(NewName); i++)
    if (islower(NewName[i])) 
      NewName[i] = toupper(NewName[i]);

  /* calculate Key for Name */
  SoundexCode(NewName, Key);
  /* fprintf(stderr, "Soundex: %s -> %s\n", Name, Key); */
}


/****************************************************************************
Now the two procedures PrintCode() and main() follow which will only be
included if TEST is defined.
****************************************************************************/

#ifdef TEST

void PrintCode (Name)
unsigned char *Name;
{
  unsigned char SoundexName[SoundexLen+1];
  unsigned char PhonixName[PhonixLen+1];

  Soundex(Name, SoundexName);
  Phonix(Name, PhonixName);
  printf("%20s --> %s %s\n", Name, SoundexName, PhonixName);
}


void main ()
{
  unsigned char s[256];
  PrintCode("CLASSEN");
  PrintCode("WRITE");
  PrintCode("WRIGHT");
  PrintCode("RITE");
  PrintCode("WHITE");
  PrintCode("WAIT");
  PrintCode("WEIGHT");
  PrintCode("KNIGHT");
  PrintCode("NIGHT");
  PrintCode("NITE");
  PrintCode("GNOME");
  PrintCode("NOAM");
  PrintCode("SMIDT");
  PrintCode("SMITH");
  PrintCode("SCHMIT");
  PrintCode("CRAFT");
  PrintCode("KRAFT");
  PrintCode("REES");
  PrintCode("REECE");
  PrintCode("YAEGER");
  PrintCode("YOGA");
  PrintCode("EAGER");
  PrintCode("AUGER");
  PrintCode("Krueger");
  PrintCode("Kruger");
  PrintCode("Krüger");
  while (1) {
    PrintCode(gets(s));
  }
}

#endif /* TEST*/