Jump to content

Recommended Posts

Posted (edited)

Here are some functions I use for doing phonetic comparisons of words. They can be useful for sorting, spell checking, etc...

PhoneticAlgorithms.au3

#include-once

;;;;  All of these functions are case-insensitive

; +------------------+
; | Public Functions |
; +------------------+

Func _SoundexNARA($sSoundexString, $iSoundexLength=4)
    ; soundex as prescribed by the U.S. National Archives & Records Administration
    Local $iLoop, $sLookup
    Local $sReturnValue
    $sSoundexString = StringUpper(StringStripWS($sSoundexString, 8))
    $sReturnValue = StringLeft($sSoundexString, 1)
    For $iLoop = 2 To StringLen($sSoundexString)
        $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop, 1))
        If $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop - 1, 1)) Then ContinueLoop
        If $sLookup < 0 Then ContinueLoop
        If $iLoop > 2 Then
            If _SoundexLookup(StringMid($sSoundexString, $iLoop - 2, 1)) > 0 Then
                If _SoundexLookup(StringMid($sSoundexString, $iLoop - 1, 1)) = -2 Then ContinueLoop
            EndIf
        EndIf
        $sReturnValue &= $sLookup
        If StringLen($sReturnValue) >= $iSoundexLength Then ExitLoop
    Next
    While StringLen($sReturnValue) < $iSoundexLength
        $sReturnValue &= "0"
    WEnd
    Return $sReturnValue
EndFunc

Func _SoundexNum($sSoundexString, $iSoundexLength=4)
    ; soundex with numbers only result
    Local $iLoop, $sLookup
    Local $sReturnValue
    $sSoundexString = StringUpper(StringStripWS($sSoundexString, 8))
    $sReturnValue = _SoundexLookup(StringLeft($sSoundexString, 1))
    If $sReturnValue < 0 Then $sReturnValue = 0
    For $iLoop = 2 To StringLen($sSoundexString)
        $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop, 1))
        If $sLookup = _SoundexLookup(StringMid($sSoundexString, $iLoop - 1, 1)) Then ContinueLoop
        If $sLookup < 0 Then ContinueLoop
        $sReturnValue &= $sLookup
        If StringLen($sReturnValue) >= $iSoundexLength Then ExitLoop
    Next
    While StringLen($sReturnValue) < $iSoundexLength
        $sReturnValue &= "0"
    WEnd
    Return $sReturnValue
EndFunc

Func _LevenshteinDistance($sString1, $sString2)
    ;Evaluates how many additions/subtractions/substitutions are needed to make the strings identical
    $sString1 = StringUpper(StringStripWS($sString1, 3))
    $sString2 = StringUpper(StringStripWS($sString2, 3))
    Local $aiScores[StringLen($sString1) + 1][StringLen($sString2) + 1]
    Local $iI, $iJ
    Local $aiEval[2]
    For $iI = 0 To StringLen($sString1)
        $aiScores[$iI][0] = 1
    Next
    For $iJ = 0 To StringLen($sString2)
        $aiScores[0][$iJ] = $iJ
    Next
    For $iI = 1 To StringLen($sString1)
        For $iJ = 1 To StringLen($sString2)
            If StringMid($sString1, $iI, 1) = StringMid($sString2, $iJ, 1) Then
                $aiScores[$iI][$iJ] = $aiScores[$iI - 1][$iJ - 1]
            Else
                $aiEval[0] = $aiScores[$iI - 1][$iJ] + 1
                $aiEval[1] = $aiScores[$iI][$iJ - 1] + 1
                If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1]
                $aiEval[1] = $aiScores[$iI - 1][$iJ - 1] + 1
                If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1]
                $aiScores[$iI][$iJ] = $aiEval[0]
            EndIf
        Next
    Next
    Return $aiScores[StringLen($sString1)][StringLen($sString2)]
EndFunc

Func _DamerauLevenshteinDistance($sString1, $sString2)
    ;Same as _LevenshteinDistance, but also accounts for transposed (switched & adjacent) characters
    Local $aiScores[StringLen($sString1) + 1][StringLen($sString2) + 1]
    Local $iI, $iJ
    Local $aiEval[2]
    For $iI = 0 To StringLen($sString1)
        $aiScores[$iI][0] = 1
    Next
    For $iJ = 0 To StringLen($sString2)
        $aiScores[0][$iJ] = $iJ
    Next
    For $iI = 1 To StringLen($sString1)
        For $iJ = 1 To StringLen($sString2)
            If StringMid($sString1, $iI, 1) = StringMid($sString2, $iJ, 1) Then
                $aiScores[$iI][$iJ] = $aiScores[$iI - 1][$iJ - 1]
            Else
                $aiEval[0] = $aiScores[$iI - 1][$iJ] + 1
                $aiEval[1] = $aiScores[$iI][$iJ - 1] + 1
                If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1]
                $aiEval[1] = $aiScores[$iI - 1][$iJ - 1] + 1
                If $aiEval[1] < $aiEval[0] Then $aiEval[0] = $aiEval[1]
                $aiScores[$iI][$iJ] = $aiEval[0]

                If ($iI > 1) And ($iJ > 1) And (StringMid($sString1, $iI, 1) = StringMid($sString2, $iJ - 1, 1)) And (StringMid($sString1, $iI - 1, 1) = StringMid($sString2, $iJ, 1)) Then
                    $aiEval[0] = $aiScores[$iI][$iJ]
                    $aiEval[1] = $aiScores[$iI - 2][$iJ - 2] + 1
                    If $aiEval[0] < $aiEval[1] Then
                        $aiScores[$iI][$iJ] = $aiEval[0]
                    Else
                        $aiScores[$iI][$iJ] = $aiEval[1]
                    EndIf
                EndIf
            EndIf
        Next
    Next
    Return $aiScores[StringLen($sString1)][StringLen($sString2)]
EndFunc

; +-------------------+
; | Private Functions |
; +-------------------+

Func _SoundexLookup($sChar)
    Switch StringUpper($sChar)
        Case "B", "F", "P", "V"
            Return 1
        Case "C", "G", "J", "K", "Q", "S", "X", "Z"
            Return 2
        Case "D", "T"
            Return 3
        Case "L"
            Return 4
        Case "M", "N"
            Return 5
        Case "R"
            Return 6
        Case "A", "E", "I", "O", "U", "Y" ;Vowels
            Return -1
        Case "H", "W" ; Special case consonants
            Return -2
        Case Else ; Unknown character
            Return -3
    EndSwitch
EndFunc

and a simple test/sample script to see each at work

test.au3

#include <PhoneticAlgorithms.au3>
Global Const $words[8] = [7, "Hello", "Yellow", "Fellow", "Bellow", "Orange", "Orangutang", "Ejllo"] ;"Ejllo" is not a word, but "Jello" with the first two letters transposed.

DoSoundexNARA()
DoSoundexNum()
DoLD()
DoDLD()


Func DoSoundexNARA()
    Local $msg = ""
    Local $i
    For $i = 1 To $words[0]
        $msg &= _SoundexNARA($words[$i]) & " - " & $words[$i] & @CRLF
    Next
    MsgBox(0, "Census Soundex Test", $msg)
EndFunc

Func DoSoundexNum()
    Local $msg = ""
    Local $i
    For $i = 1 To $words[0]
        $msg &= _SoundexNum($words[$i]) & " - " & $words[$i] & @CRLF
    Next
    MsgBox(0, "Numeric Soundex Test", $msg)
EndFunc

Func DoLD()
    Local $msg = ""
    Local $i
    For $i = 1 To $words[0]
        $msg &= _LevenshteinDistance("Jello", $words[$i]) & " - Jello <==> " & $words[$i] & @CRLF
    Next
    MsgBox(0, "Levenshtein Distance Test", $msg)
EndFunc

Func DoDLD()
    Local $msg = ""
    Local $i
    For $i = 1 To $words[0]
        $msg &= _DamerauLevenshteinDistance("Jello", $words[$i]) & " - Jello <==> " & $words[$i] & @CRLF
    Next
    MsgBox(0, "Damerau-Levenshtein Distance Test", $msg)
EndFunc

----- Edit -----

I know there have been other versions of these algorithms posted in the past. These are the ones that I personally have been using, and prefer. They are adaptations of old Pascal routines I used years ago.

Edited by willichan
  • 2 years later...

Create an account or sign in to comment

You need to be a member in order to leave a comment

Create an account

Sign up for a new account in our community. It's easy!

Register a new account

Sign in

Already have an account? Sign in here.

Sign In Now
  • Recently Browsing   0 members

    • No registered users viewing this page.
×
×
  • Create New...