willichan Posted October 11, 2010 Posted October 11, 2010 (edited) Here are some functions I use for doing phonetic comparisons of words. They can be useful for sorting, spell checking, etc...PhoneticAlgorithms.au3expandcollapse popup#include-once ;;;; All of these functions are case-insensitive ; +------------------+ ; | Public Functions | ; +------------------+ Func _SoundexNARA($sSoundexString, $iSoundexLength=4) ; soundex as prescribed by the U.S. National Archives & Records Administration Local $iLoop, $sLookup Local $sReturnValue $sSoundexString = StringUpper(StringStripWS($sSoundexString, 8)) $sReturnValue = StringLeft($sSoundexString, 1) For $iLoop = 2 To StringLen($sSoundexString) $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop, 1)) If $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop - 1, 1)) Then ContinueLoop If $sLookup < 0 Then ContinueLoop If $iLoop > 2 Then If _SoundexLookup(StringMid($sSoundexString, $iLoop - 2, 1)) > 0 Then If _SoundexLookup(StringMid($sSoundexString, $iLoop - 1, 1)) = -2 Then ContinueLoop EndIf EndIf $sReturnValue &= $sLookup If StringLen($sReturnValue) >= $iSoundexLength Then ExitLoop Next While StringLen($sReturnValue) < $iSoundexLength $sReturnValue &= "0" WEnd Return $sReturnValue EndFunc Func _SoundexNum($sSoundexString, $iSoundexLength=4) ; soundex with numbers only result Local $iLoop, $sLookup Local $sReturnValue $sSoundexString = StringUpper(StringStripWS($sSoundexString, 8)) $sReturnValue = _SoundexLookup(StringLeft($sSoundexString, 1)) If $sReturnValue < 0 Then $sReturnValue = 0 For $iLoop = 2 To StringLen($sSoundexString) $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop, 1)) If $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop - 1, 1)) Then ContinueLoop If $sLookup < 0 Then ContinueLoop $sReturnValue &= $sLookup If StringLen($sReturnValue) >= $iSoundexLength Then ExitLoop Next While StringLen($sReturnValue) < $iSoundexLength $sReturnValue &= "0" WEnd Return $sReturnValue EndFunc Func _LevenshteinDistance($sString1, $sString2) ;Evaluates how many additions/subtractions/substitutions are needed to make the strings identical $sString1 = StringUpper(StringStripWS($sString1, 3)) $sString2 = StringUpper(StringStripWS($sString2, 3)) Local $aiScores[StringLen($sString1) + 1][StringLen($sString2) + 1] Local $iI, $iJ Local $aiEval[2] For $iI = 0 To StringLen($sString1) $aiScores[$iI][0] = 1 Next For $iJ = 0 To StringLen($sString2) $aiScores[0][$iJ] = $iJ Next For $iI = 1 To StringLen($sString1) For $iJ = 1 To StringLen($sString2) If StringMid($sString1, $iI, 1) = StringMid($sString2, $iJ, 1) Then $aiScores[$iI][$iJ] = $aiScores[$iI - 1][$iJ - 1] Else $aiEval[0] = $aiScores[$iI - 1][$iJ] + 1 $aiEval[1] = $aiScores[$iI][$iJ - 1] + 1 If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1] $aiEval[1] = $aiScores[$iI - 1][$iJ - 1] + 1 If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1] $aiScores[$iI][$iJ] = $aiEval[0] EndIf Next Next Return $aiScores[StringLen($sString1)][StringLen($sString2)] EndFunc Func _DamerauLevenshteinDistance($sString1, $sString2) ;Same as _LevenshteinDistance, but also accounts for transposed (switched & adjacent) characters Local $aiScores[StringLen($sString1) + 1][StringLen($sString2) + 1] Local $iI, $iJ Local $aiEval[2] For $iI = 0 To StringLen($sString1) $aiScores[$iI][0] = 1 Next For $iJ = 0 To StringLen($sString2) $aiScores[0][$iJ] = $iJ Next For $iI = 1 To StringLen($sString1) For $iJ = 1 To StringLen($sString2) If StringMid($sString1, $iI, 1) = StringMid($sString2, $iJ, 1) Then $aiScores[$iI][$iJ] = $aiScores[$iI - 1][$iJ - 1] Else $aiEval[0] = $aiScores[$iI - 1][$iJ] + 1 $aiEval[1] = $aiScores[$iI][$iJ - 1] + 1 If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1] $aiEval[1] = $aiScores[$iI - 1][$iJ - 1] + 1 If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1] $aiScores[$iI][$iJ] = $aiEval[0] If ($iI > 1) And ($iJ > 1) And (StringMid($sString1, $iI, 1) = StringMid($sString2, $iJ - 1, 1)) And (StringMid($sString1, $iI - 1, 1) = StringMid($sString2, $iJ, 1)) Then $aiEval[0] = $aiScores[$iI][$iJ] $aiEval[1] = $aiScores[$iI - 2][$iJ - 2] + 1 If $aiEval[0] < $aiEval[1] Then $aiScores[$iI][$iJ] = $aiEval[0] Else $aiScores[$iI][$iJ] = $aiEval[1] EndIf EndIf EndIf Next Next Return $aiScores[StringLen($sString1)][StringLen($sString2)] EndFunc ; +-------------------+ ; | Private Functions | ; +-------------------+ Func _SoundexLookup($sChar) Switch StringUpper($sChar) Case "B", "F", "P", "V" Return 1 Case "C", "G", "J", "K", "Q", "S", "X", "Z" Return 2 Case "D", "T" Return 3 Case "L" Return 4 Case "M", "N" Return 5 Case "R" Return 6 Case "A", "E", "I", "O", "U", "Y" ;Vowels Return -1 Case "H", "W" ; Special case consonants Return -2 Case Else ; Unknown character Return -3 EndSwitch EndFuncand a simple test/sample script to see each at worktest.au3expandcollapse popup#include <PhoneticAlgorithms.au3> Global Const $words[8] = [7, "Hello", "Yellow", "Fellow", "Bellow", "Orange", "Orangutang", "Ejllo"] ;"Ejllo" is not a word, but "Jello" with the first two letters transposed. DoSoundexNARA() DoSoundexNum() DoLD() DoDLD() Func DoSoundexNARA() Local $msg = "" Local $i For $i = 1 To $words[0] $msg &= _SoundexNARA($words[$i]) & " - " & $words[$i] & @CRLF Next MsgBox(0, "Census Soundex Test", $msg) EndFunc Func DoSoundexNum() Local $msg = "" Local $i For $i = 1 To $words[0] $msg &= _SoundexNum($words[$i]) & " - " & $words[$i] & @CRLF Next MsgBox(0, "Numeric Soundex Test", $msg) EndFunc Func DoLD() Local $msg = "" Local $i For $i = 1 To $words[0] $msg &= _LevenshteinDistance("Jello", $words[$i]) & " - Jello <==> " & $words[$i] & @CRLF Next MsgBox(0, "Levenshtein Distance Test", $msg) EndFunc Func DoDLD() Local $msg = "" Local $i For $i = 1 To $words[0] $msg &= _DamerauLevenshteinDistance("Jello", $words[$i]) & " - Jello <==> " & $words[$i] & @CRLF Next MsgBox(0, "Damerau-Levenshtein Distance Test", $msg) EndFunc----- Edit -----I know there have been other versions of these algorithms posted in the past. These are the ones that I personally have been using, and prefer. They are adaptations of old Pascal routines I used years ago. Edited October 11, 2010 by willichan My UDFs: Barcode Libraries, Automate creation of any type of project folder, File Locking with Cooperative Semaphores, Inline binary files, Continue script after reboot, WinWaitMulti, Name Aggregator, Enigma, CornedBeef Hash
kristo Posted June 4, 2013 Posted June 4, 2013 Excellent work Thank you. Cheap, Fast, Good - Choose any two
Recommended Posts
Create an account or sign in to comment
You need to be a member in order to leave a comment
Create an account
Sign up for a new account in our community. It's easy!
Register a new accountSign in
Already have an account? Sign in here.
Sign In Now