/****************************************************************************
*****************************************************************************
FILE : soundex.c
FUNCTION : This module contains the two algorithms SOUNDEX and PHONIX as
they were defined by Gadd.
NOTES : This is an ANSI-C version of the original C++ file.
LITERATURE: T.N. Gadd: 'Fishing fore Werds': Phonetic Retrieval of written
text in Information Retrieval Systems, Program 22/3, 1988,
p.222-237.
T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990,
p.363-366.
*****************************************************************************
****************************************************************************/
/* #define TEST */ /* activates procedures main() and PrintCode() */
/* #define PHONIX_DEBUG */ /* activates some debug information */
/****************************************************************************
NAME : StrDel
INPUT : char *DelPos --- pointer to first char to be deleted
int DelSize --- number of chars to be deleted
OUTPUT : char *DelPos
FUNCTION: This procedure deletes DelSize chars at position DelPos and moves
the remaining chars left to DelPos.
EXAMPLE : If Del is pointing at the L of the string "DELETE" the call
StrDel(Del, 2) will return Del pointing at "TE".
****************************************************************************/
#ifdef __cplusplus
extern "C" {
#endif
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#ifdef __cplusplus
}
#endif
void StrDel (DelPos, DelSize)
char *DelPos;
int DelSize;
{
/* move chars left */
char *Help = DelPos + DelSize;
while (*Help)
*DelPos++ = *Help++;
/* move trailing \0 */
*DelPos = *Help;
}
/****************************************************************************
NAME : StrIns
INPUT : char *InsPos --- pointer to insert position
OUTPUT : char *InStr --- new string to be inserted
FUNCTION: StrIns moves the chars at position InsPos right and copies the
string InsStr into this free space.
EXAMPLE : If Ins is pointing at the S of the string "INSERT" the call
StrIns(Ins, "NEW") will return Ins pointing at "NEWSERT".
****************************************************************************/
void StrIns (InsPos, InsStr)
char *InsPos;
char *InsStr;
{
int i;
int MoveSize = strlen(InsStr);
/* move chars right */
for (i = strlen(InsPos)+1; i >= 0; i--)
InsPos[i+MoveSize] = InsPos[i];
/* copy InsStr to InsPos */
while (*InsStr)
*InsPos++ = *InsStr++;
}
extern bool IsVowel();
#if 0
/****************************************************************************
NAME : IsVowel
INPUT : char c --- char to be examined
OUTPUT : int --- 1 or 0
FUNCTION: IsVowel checks if c is an uppercase vowel or an uppercase Y. If c
is one of those chars IsVowel will return a 1, else it will return
a 0.
****************************************************************************/
int IsVowel (c)
unsigned char c;
{
return (c == 'A') || (c == 'E') || (c == 'I') ||
(c == 'O') || (c == 'U') || (c == 'Y') ||
(c == 0304) || (c == 0344) || (c == 0334) ||
(c == 0366) || (c == 0326) || (c == 0374);
}
/****************************************************************************
NAME : SoundexCode
INPUT : char *Name --- string to be calculated
OUTPUT : char *Key --- soundex code for Name
FUNCTION: This procedure calculates a four-letter soundex code for the string
Name and returns this code in the string Key.
****************************************************************************/
#define SoundexLen 4 /* length of a soundex code */
#define SoundexKey "Z000" /* default key for soundex code */
char SCode(c)
unsigned char c;
{
/* set letter values */
static int Code[] = {0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0,
1, 2, 6, 2, 3, 0, 1, 0, 2, 0, 2};
fprintf(stderr, "SCode(%c)\n", c);
if (c == 0337) return(2); /* german sz */
return(Code[toupper(c)-'A']);
}
void SoundexCode (Name, Key)
unsigned char *Name;
unsigned char *Key;
{
unsigned char LastLetter;
int Index;
fprintf(stderr, "SoundexCode(%s)\n", Name);
/* set default key */
strcpy(Key, SoundexKey);
/* keep first letter */
Key[0] = *Name;
LastLetter = *Name;
Name++;
/* scan rest of string */
for (Index = 1; (Index < SoundexLen) && *Name; Name++)
{
/* use only letters */
if (isalpha(*Name))
{
/* ignore duplicate successive chars */
if (LastLetter != *Name)
{
/* new LastLetter */
LastLetter = *Name;
/* ignore letters with code 0 */
if (!IsVowel(*Name) && (SCode(*Name) != 0))
{
Key[Index] = '0' + SCode(*Name);
Index++;
}
}
}
}
}
#endif /* 0 */
/****************************************************************************
NAME : PhonixCode
INPUT : char *Name --- string to be calculated
OUTPUT : char *Key --- phonix code for Name
FUNCTION: This procedure calculates a eight-letter phonix code for the string
Name and returns this code in the string Key.
****************************************************************************/
#define PhonixLen 8 /* length of a phonix code */
#define PhonixKey "Z0000000" /* default key for phonix code */
extern bool IsAlpha();
extern char PCode();
void PhonixCode (Name, Key)
char *Name;
char *Key;
{
char LastLetter;
int Index;
/* set default key */
strcpy(Key, PhonixKey);
/* keep first letter or replace it with '$' */
Key[0] = IsVowel(*Name) ? '$' : *Name;
LastLetter = *Name;
Name++;
/* NOTE: Gadd replaces vowels being the first letter of the */
/* word with a 'v'. Due to the implementation of WAIS all */
/* letters will be lowercased. Therefore '$' is used instead */
/* of 'v'. */
/* scan rest of string */
for (Index = 1; (Index < PhonixLen) && *Name; Name++)
{
/* use only letters */
if (IsAlpha(*Name))
{
/* ignore duplicate successive chars */
if (LastLetter != *Name)
{
LastLetter = *Name;
/* ignore letters with code 0 except as separators */
if (PCode(*Name) != '0')
{
Key[Index] = PCode(*Name);
Index++;
}
}
}
}
}
/****************************************************************************
NAME : PhonixReplace1
INPUT : int where --- replace OldStr only if it occurs at this position
char *Name --- string to work
char *OldStr --- old letter group to delete
char *NewStr --- new letter group to insert
int CondPre --- condition referring to letter before OldStr
int CondPost --- condition referring to letter after OldStr
OUTPUT : char *Name --- Name with replaced letter group
FUNCTION: This procedure replaces the letter group OldStr with the letter
group NewStr in the string Name, regarding the position of OldStr
where (START, MIDDLE, END, ALL) and the conditions CondPre and
CondPost (NON, VOC, CON).
EXAMPLE : PhonixReplace1(START, "WAWA", "W", "V", NON, NON) replaces only the
first W with a V because of the condition START.
EXAMPLE : PhonixReplace1(START, "WAWA", "W", "V", NON, CON) replaces neither
the first W with a V (because of the condition CON, i.e. a consonant
must follow the W) nor the second W (because of the condition START).
****************************************************************************/
#define NON 1 /* no condition */
#define VOC 2 /* vowel needed */
#define CON 3 /* consonant needed */
#define START 1 /* condition refers to beginning of Name */
#define MIDDLE 2 /* condition refers to middle of Name */
#define END 3 /* condition refers to EndPos of Name */
#define ALL 4 /* condition refers to whole Name */
void PhonixReplace1 (Where, Name, OldStr, NewStr, CondPre, CondPost)
int Where;
char *Name;
char *OldStr;
char *NewStr;
int CondPre;
int CondPost;
{
char *OldStrPos;
char *EndPos;
char *NamePtr = Name;
/* vowels before or after OldStr */
char LetterPre; /* letter before OldStr */
char LetterPost; /* letter after OldStr */
int VowelPre; /* LetterPre is vowel? */
int VowelPost; /* LetterPost is vowel? */
int OkayPre; /* pre-condition okay? */
int OkayPost; /* post-condition okay? */
do
{
/* find OldStr in NamePtr */
OldStrPos = strstr(NamePtr, OldStr);
/* find EndPos of Name */
EndPos = &Name[strlen(Name)-strlen(OldStr)];
/* check conditions if OldStrPos != NULL */
if (OldStrPos)
{
/* vowel before OldStrPos */
LetterPre = *(OldStrPos-1);
/* vowel after OldStrPos+strlen(OldStr) */
LetterPost = *(OldStrPos+strlen(OldStr));
/* check conditions */
switch (CondPre)
{
case NON: OkayPre = 1;
break;
case VOC: OkayPre = LetterPre ? IsVowel(LetterPre) : 0;
break;
case CON: OkayPre = LetterPre ? !IsVowel(LetterPre) : 0;
break;
default : OkayPre = 0;
break;
}
switch (CondPost)
{
case NON: OkayPost = 1;
break;
case VOC: OkayPost = LetterPost ? IsVowel(LetterPost) : 0;
break;
case CON: OkayPost = LetterPost ? !IsVowel(LetterPost) : 0;
break;
default : OkayPost = 0;
break;
}
}
/* replace OldStr with NewStr */
if (OldStrPos && OkayPre && OkayPost &&
((Where == START) && (OldStrPos == Name) ||
(Where == MIDDLE) && (OldStrPos != Name) && (OldStrPos != EndPos) ||
(Where == END) && (OldStrPos == EndPos) ||
(Where == ALL)))
{
/* replace old letter group with new letter group */
StrDel(OldStrPos, strlen(OldStr));
StrIns(OldStrPos, NewStr);
/* advance NamePtr to the position of OldStr */
NamePtr = OldStrPos;
#ifdef PHONIX_DEBUG
printf("Replace = %s-->%s\n", OldStr, NewStr);
#endif /* PHONIX_DEBUG */
}
else
/* advance NamePtr one char */
NamePtr++;
}
while (OldStrPos);
}
/****************************************************************************
NAME : PhonixReplace2
INPUT : int where --- replace OldStr only if it occurs at this position
char *Name --- string to work
char *OldStr --- old letter group to delete
char *NewStr --- new letter group to insert
OUTPUT : char *Name --- Name with replaced letter group
FUNCTION: This procedure replaces the letter group OldStr with the letter
group NewStr in the string Name, regarding the position of OldStr
where (START, MIDDLE, END, ALL).
EXAMPLE : PhonixReplace2(START, "WAWA", "W", "V") replaces only the first W
with a V because of the condition START.
****************************************************************************/
void PhonixReplace2 (Where, Name, OldStr, NewStr)
int Where;
char *Name;
char *OldStr;
char *NewStr;
{
char *OldStrPos;
char *EndPos;
char *NamePtr = Name;
do
{
/* find OldStr in NamePtr */
OldStrPos = strstr(NamePtr, OldStr);
/* find EndPos of Name */
EndPos = &Name[strlen(Name)-strlen(OldStr)];
/* replace OldStr with NewStr */
if (OldStrPos &&
((Where == START) && (OldStrPos == Name) ||
(Where == MIDDLE) && (OldStrPos != Name) && (OldStrPos != EndPos) ||
(Where == END) && (OldStrPos == EndPos) ||
(Where == ALL)))
{
/* replace old letter group with new letter group */
StrDel(OldStrPos, strlen(OldStr));
StrIns(OldStrPos, NewStr);
/* advance NamePtr to the position of OldStr */
NamePtr = OldStrPos;
#ifdef PHONIX_DEBUG
printf("Replace = %s-->%s\n", OldStr, NewStr);
#endif /* PHONIX_DEBUG */
}
else
/* advance NamePtr one char */
NamePtr++;
}
while (OldStrPos);
}
/****************************************************************************
NAME : Phonix
INPUT : char *Name --- string to calculate phonix code for
OUTPUT : char *Key --- phonix code of Name
FUNCTION: Phonix calculates the phonix code for the string Name.
****************************************************************************/
void Phonix (Name, Key)
char *Name;
char *Key;
{
/* use new variable NewName to remain Name unchanged */
char NewName[50];
int i;
strcpy(NewName, Name);
/* uppercase NewName */
for (i=0; i < strlen(NewName); i++)
if (islower(NewName[i]))
NewName[i] = toupper(NewName[i]);
#ifdef PHONIX_DEBUG
printf("Name = %s\n", NewName);
#endif /* PHONIX_DEBUG */
/* replace letter groups according to Gadd's definition */
PhonixReplace2(ALL , NewName, "DG" , "G" );
PhonixReplace2(ALL , NewName, "CO" , "KO" );
PhonixReplace2(ALL , NewName, "CA" , "KA" );
PhonixReplace2(ALL , NewName, "CU" , "KU" );
PhonixReplace2(ALL , NewName, "CY" , "SI" );
PhonixReplace2(ALL , NewName, "CI" , "SI" );
PhonixReplace2(ALL , NewName, "CE" , "SE" );
PhonixReplace1(START , NewName, "CL" , "KL" , NON, VOC);
PhonixReplace2(ALL , NewName, "CK" , "K" );
PhonixReplace2(END , NewName, "GC" , "K" );
PhonixReplace2(END , NewName, "JC" , "K" );
PhonixReplace1(START , NewName, "CHR" , "KR" , NON, VOC);
PhonixReplace1(START , NewName, "CR" , "KR" , NON, VOC);
PhonixReplace2(START , NewName, "WR" , "R" );
PhonixReplace2(ALL , NewName, "NC" , "NK" );
PhonixReplace2(ALL , NewName, "CT" , "KT" );
PhonixReplace2(ALL , NewName, "PH" , "F" );
PhonixReplace2(ALL , NewName, "AA" , "AR" );
PhonixReplace2(ALL , NewName, "SCH" , "SH" );
PhonixReplace2(ALL , NewName, "BTL" , "TL" );
PhonixReplace2(ALL , NewName, "GHT" , "T" );
PhonixReplace2(ALL , NewName, "AUGH" , "ARF" );
PhonixReplace1(MIDDLE, NewName, "LJ" , "LD" , VOC, VOC);
PhonixReplace2(ALL , NewName, "LOUGH", "LOW" );
PhonixReplace2(START , NewName, "Q" , "KW" );
PhonixReplace2(START , NewName, "KN" , "N" );
PhonixReplace2(END , NewName, "GN" , "N" );
PhonixReplace2(ALL , NewName, "GHN" , "N" );
PhonixReplace2(END , NewName, "GNE" , "N" );
PhonixReplace2(ALL , NewName, "GHNE" , "NE" );
PhonixReplace2(END , NewName, "GNES" , "NS" );
PhonixReplace2(START , NewName, "GN" , "N" );
PhonixReplace1(MIDDLE, NewName, "GN" , "N" , NON, CON);
PhonixReplace1(END , NewName, "GN" , "N" , NON, NON); /* NON,CON */
PhonixReplace2(START , NewName, "PS" , "S" );
PhonixReplace2(START , NewName, "PT" , "T" );
PhonixReplace2(START , NewName, "CZ" , "C" );
PhonixReplace1(MIDDLE, NewName, "WZ" , "Z" , VOC, NON);
PhonixReplace2(MIDDLE, NewName, "CZ" , "CH" );
PhonixReplace2(ALL , NewName, "LZ" , "LSH" );
PhonixReplace2(ALL , NewName, "RZ" , "RSH" );
PhonixReplace1(MIDDLE, NewName, "Z" , "S" , NON, VOC);
PhonixReplace2(ALL , NewName, "ZZ" , "TS" );
PhonixReplace1(MIDDLE, NewName, "Z" , "TS" , CON, NON);
PhonixReplace2(ALL , NewName, "HROUG", "REW" );
PhonixReplace2(ALL , NewName, "OUGH" , "OF" );
PhonixReplace1(MIDDLE, NewName, "Q" , "KW" , VOC, VOC);
PhonixReplace1(MIDDLE, NewName, "J" , "Y" , VOC, VOC);
PhonixReplace1(START , NewName, "YJ" , "Y" , NON, VOC);
PhonixReplace2(START , NewName, "GH" , "G" );
PhonixReplace1(END , NewName, "E" , "GH" , VOC, NON);
PhonixReplace2(START , NewName, "CY" , "S" );
PhonixReplace2(ALL , NewName, "NX" , "NKS" );
PhonixReplace2(START , NewName, "PF" , "F" );
PhonixReplace2(END , NewName, "DT" , "T" );
PhonixReplace2(END , NewName, "TL" , "TIL" );
PhonixReplace2(END , NewName, "DL" , "DIL" );
PhonixReplace2(ALL , NewName, "YTH" , "ITH" );
PhonixReplace1(START , NewName, "TJ" , "CH" , NON, VOC);
PhonixReplace1(START , NewName, "TSJ" , "CH" , NON, VOC);
PhonixReplace1(START , NewName, "TS" , "T" , NON, VOC);
PhonixReplace1(ALL , NewName, "TCH" , "CH" );
PhonixReplace1(MIDDLE, NewName, "WSK" , "VSKIE", VOC, NON);
PhonixReplace1(END , NewName, "WSK" , "VSKIE", VOC, NON);
PhonixReplace1(START , NewName, "MN" , "N" , NON, VOC);
PhonixReplace1(START , NewName, "PN" , "N" , NON, VOC);
PhonixReplace1(MIDDLE, NewName, "STL" , "SL" , VOC, NON);
PhonixReplace1(END , NewName, "STL" , "SL" , VOC, NON);
PhonixReplace2(END , NewName, "TNT" , "ENT" );
PhonixReplace2(END , NewName, "EAUX" , "OH" );
PhonixReplace2(ALL , NewName, "EXCI" , "ECS" );
PhonixReplace2(ALL , NewName, "X" , "ECS" );
PhonixReplace2(END , NewName, "NED" , "ND" );
PhonixReplace2(ALL , NewName, "JR" , "DR" );
PhonixReplace2(END , NewName, "EE" , "EA" );
PhonixReplace2(ALL , NewName, "ZS" , "S" );
PhonixReplace1(MIDDLE, NewName, "R" , "AH" , VOC, CON);
PhonixReplace1(END , NewName, "R" , "AH" , VOC, NON); /* VOC,CON */
PhonixReplace1(MIDDLE, NewName, "HR" , "AH" , VOC, CON);
PhonixReplace1(END , NewName, "HR" , "AH" , VOC, NON); /* VOC,CON */
PhonixReplace1(END , NewName, "HR" , "AH" , VOC, NON);
PhonixReplace2(END , NewName, "RE" , "AR" );
PhonixReplace1(END , NewName, "R" , "AH" , VOC, NON);
PhonixReplace2(ALL , NewName, "LLE" , "LE" );
PhonixReplace1(END , NewName, "LE" , "ILE" , CON, NON);
PhonixReplace1(END , NewName, "LES" , "ILES" , CON, NON);
PhonixReplace2(END , NewName, "E" , "" );
PhonixReplace2(END , NewName, "ES" , "S" );
PhonixReplace1(END , NewName, "SS" , "AS" , VOC, NON);
PhonixReplace1(END , NewName, "MB" , "M" , VOC, NON);
PhonixReplace2(ALL , NewName, "MPTS" , "MPS" );
PhonixReplace2(ALL , NewName, "MPS" , "MS" );
PhonixReplace2(ALL , NewName, "MPT" , "MT" );
/* calculate Key for NewName */
PhonixCode(NewName, Key);
#ifdef PHONIX_DEBUG
printf("NewName = %s\n", NewName);
printf("Code = %s\n\n", Key);
#endif /* PHONIX_DEBUG */
}
/****************************************************************************
NAME : Soundex
INPUT : char *Name --- string to calculate soundex code for
OUTPUT : char *Key --- soundex code of Name
FUNCTION: Soundex calculates the soundex code for the string Name.
****************************************************************************/
void Soundex (Name, Key)
char *Name;
char *Key;
{
/* use new variable NewName to remain Name unchanged */
char NewName[50];
int i;
strcpy(NewName, Name);
/* uppercase NewName */
for (i=0; i < strlen(NewName); i++)
if (islower(NewName[i]))
NewName[i] = toupper(NewName[i]);
/* calculate Key for Name */
SoundexCode(NewName, Key);
/* fprintf(stderr, "Soundex: %s -> %s\n", Name, Key); */
}
/****************************************************************************
Now the two procedures PrintCode() and main() follow which will only be
included if TEST is defined.
****************************************************************************/
#ifdef TEST
void PrintCode (Name)
unsigned char *Name;
{
unsigned char SoundexName[SoundexLen+1];
unsigned char PhonixName[PhonixLen+1];
Soundex(Name, SoundexName);
Phonix(Name, PhonixName);
printf("%20s --> %s %s\n", Name, SoundexName, PhonixName);
}
void main ()
{
unsigned char s[256];
PrintCode("CLASSEN");
PrintCode("WRITE");
PrintCode("WRIGHT");
PrintCode("RITE");
PrintCode("WHITE");
PrintCode("WAIT");
PrintCode("WEIGHT");
PrintCode("KNIGHT");
PrintCode("NIGHT");
PrintCode("NITE");
PrintCode("GNOME");
PrintCode("NOAM");
PrintCode("SMIDT");
PrintCode("SMITH");
PrintCode("SCHMIT");
PrintCode("CRAFT");
PrintCode("KRAFT");
PrintCode("REES");
PrintCode("REECE");
PrintCode("YAEGER");
PrintCode("YOGA");
PrintCode("EAGER");
PrintCode("AUGER");
PrintCode("Krueger");
PrintCode("Kruger");
PrintCode("Krüger");
while (1) {
PrintCode(gets(s));
}
}
#endif /* TEST*/