2016-02-10 11 views
5

Ich möchte zwei Zeichenketten vergleichen, die Symbole aus verschiedenen Alphabeten enthalten (z. B. Russisch und Englisch). Ich möchte, dass Symbole, die ähnlich aussehen, als gleichwertig betrachtet werden.

Zeichenketten mit Symbolen aus verschiedenen Alphabeten vergleichen

Z. B. ist in dem Wort "Mom" der Buchstabe "o" aus dem englischen Alphabet (Code 006F in Unicode), und in dem Wort "Mоm" ist der Buchstabe "о" aus dem russischen Alphabet (Code 043E in Unicode). Also ergibt ("Mom" = "Mоm") => falsch, aber ich möchte, dass es wahr ist. Gibt es dafür eine Standard-SAS-Funktion, oder sollte ich ein Makro dafür schreiben?

Danke!

Antwort

1

Ich würde es so machen:

Zuerst würde ich eine Zuordnungstabelle (Map) erstellen, also festlegen, welcher russische Buchstabe welchem englischen Buchstaben entspricht. Beispiel:
б = b
в = v
...

Diese Zuordnung würde ich in einer separaten Tabelle oder als Makrovariablen speichern. Dann würde ich eine Makroschleife mit der Funktion tranwrd erstellen, die die erstellte Zuordnung durchläuft.

Ein Beispiel könnte so aussehen:

/* Sketch: rewrite a string by replacing Cyrillic letters with Latin ones, */
/* one TRANWRD call per entry of the letter mapping.                      */
data _null_; 
    stringBefore = "без"; 
    /* Each call replaces every occurrence of one letter; the result of   */
    /* the previous call is fed into the next one.                        */
    stringAfter = tranwrd(stringBefore,"а","a"); 
    stringAfter = tranwrd(stringAfter,"б","b"); 
    stringAfter = tranwrd(stringAfter,"в","v"); 
/* Placeholder below: the remaining mapping entries would follow here.    */
... 
run; 

Nach dieser Umwandlung denke ich, dass Sie Ihre Strings vergleichen können.

0

Ich habe auch einige Funktionen geschrieben, die Tippfehler durch ein falsches Tastaturlayout behandeln. Hier ist der Code:

/***************************************************************************/ 
/* FUNCTION count_rus_letters RETURNS NUMBER OF CYRILLIC LETTERS IN STRING */ 
/***************************************************************************/ 
proc fcmp outlib=sasuser.userfuncs.mystring;
    /* Walks over the string one character at a time and counts those      */
    /* that appear in the list of upper- and lower-case Russian letters.   */
    /*   string : character value to inspect                               */
    /*   returns: numeric count of Russian letters found                   */
    function count_rus_letters(string $);
        /* NOTE(review): $2 presumably relies on a session encoding where  */
        /* one Cyrillic letter fits in two bytes (e.g. UTF-8) - confirm.   */
        length ch $2;

        n_rus = 0;
        n_chars = klength(string);

        do pos = 1 to n_chars;
            ch = ksubstr(string, pos, 1);
            if ch in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж",
                      "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р",
                      "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ",
                      "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я")
            then n_rus = n_rus + 1;
        end;

        return(n_rus);
    endsub;
run;

/**************************************************************************/ 
/* FUNCTION count_eng_letters RETURNS NUMBER OF ENGLISH LETTERS IN STRING */ 
/**************************************************************************/ 
proc fcmp outlib=sasuser.userfuncs.mystring; 
/* Counts the characters of the string that are Latin letters A-Z or a-z. */
/*   string : character value to inspect                                  */
/*   returns: numeric count of English letters found                      */
FUNCTION count_eng_letters(string $); 
length letter $2; 

eng_count=0; 

len=klength(string); 

do i=1 to len; 
    letter=ksubstr(string,i,1); 
    /* RANK looks at the first byte of the extracted character. */
    code=rank(letter); 
    /* Test the upper- and lower-case ranges separately: a single range  */
    /* from 'A' to 'z' would also accept the characters [ \ ] ^ _ `      */
    /* that lie between 'Z' and 'a'.                                     */
    /* NOTE(review): assumes an ASCII-compatible encoding in which each  */
    /* letter range is contiguous - confirm for EBCDIC hosts.            */
    if (rank('A') <= code <= rank('Z')) or (rank('a') <= code <= rank('z')) 
    then eng_count+1; 
end; 

return(eng_count); 
endsub; 
run; 

/**************************************************************************/ 
/* FUNCTION is_string_russian RETURNS 1 IF NUMBER OF RUSSIAN SYMBOLS IN */ 
/* STRING >= NUMBER OF ENGLISH SYMBOLS         */ 
/**************************************************************************/ 
proc fcmp outlib=sasuser.userfuncs.mystring; 
/* Counts Russian and English letters in one pass and compares the counts. */
/*   string : character value to inspect                                   */
/*   returns: 1 when Russian letters >= English letters, otherwise 0       */
/*            (a string with no letters of either kind therefore yields 1) */
FUNCTION is_string_russian(string $); 
length letter $2 result 8; 

eng_count=0; 
rus_count=0; 

len=klength(string); 

do i=1 to len; 
    letter=ksubstr(string,i,1); 
    if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж" 
     "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р", 
     "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ" 
     "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я") 
    then rus_count+1; 
    /* RANK looks at the first byte of the extracted character. */
    code=rank(letter); 
    /* Two separate ranges: a single 'A'..'z' range would also count the */
    /* characters [ \ ] ^ _ ` as English letters and could flip the      */
    /* result for strings containing such punctuation.                   */
    /* NOTE(review): assumes an ASCII-compatible encoding - confirm.     */
    if (rank('A') <= code <= rank('Z')) or (rank('a') <= code <= rank('z')) 
    then eng_count+1; 
end; 

if rus_count>=eng_count 
then result=1; 
else result=0; 

return(result); 
endsub; 
run; 

/**************************************************************************/ 
/* FUNCTION fix_layout_misprints REPLACES MISPRINTED SYMBOLS BY ANALYSING */ 
/* LANGUAGE OF THE STRING (FOR ENGLISH STRING RUSSIAN SYMBOLS ARE   */ 
/* REPLACED BY ENGLISH COPIES AND FOR RUSSIAN STRING SYMBOLS ARE   */ 
/* REPLACED BY RUSSIAN COPIES)           */ 
/**************************************************************************/ 
proc fcmp outlib=sasuser.userfuncs.mystring; 
/* Decides the language of the string by counting Russian and English     */
/* letters, then replaces look-alike letters of the other alphabet.       */
/*   string : character value to repair                                   */
/*   returns: repaired string, declared with a length of 1000             */
FUNCTION fix_layout_misprints(string $) $ 1000; 
length letter $2 result $1000; 

eng_count=0; 
rus_count=0; 

len=klength(string); 

do i=1 to len; 
    letter=ksubstr(string,i,1); 
    if letter in ("А","а","Б","б","В","в","Г","г","Д","д","Е","е","Ё","ё","Ж","ж" 
     "З","з","И","и","Й","й","К","к","Л","л","М","м","Н","н","О","о","П","п","Р","р", 
     "С","с","Т","т","У","у","Ф","ф","Х","х","Ц","ц","Ч","ч","Ш","ш","Щ","щ","Ъ","ъ" 
     "Ы","ы","Ь","ь","Э","э","Ю","ю","Я","я") 
    then rus_count+1; 
    /* RANK looks at the first byte of the extracted character. */
    code=rank(letter); 
    /* Two separate ranges: a single 'A'..'z' range would also count the */
    /* characters [ \ ] ^ _ ` as English letters, which could make a     */
    /* Russian string be treated as English and translated the wrong way.*/
    /* NOTE(review): assumes an ASCII-compatible encoding - confirm.     */
    if (rank('A') <= code <= rank('Z')) or (rank('a') <= code <= rank('z')) 
    then eng_count+1; 
end; 

/* KTRANSLATE takes the replacement characters first and the characters  */
/* to be replaced second. Ties are resolved in favour of Russian.        */
if rus_count>=eng_count 
then result=ktranslate(string,"АаВЕеКкМОоРрСсТХх","AaBEeKkMOoPpCcTXx"); 
else result=ktranslate(string,"AaBEeKkMOoPpCcTXx","АаВЕеКкМОоРрСсТХх"); 

return(result); 
endsub; 
run; 

/***********/ 
/* EXAMPLE */ 
/***********/ 
/* Make the compiled functions in sasuser.userfuncs available to the step. */
options cmplib=sasuser.userfuncs;
data _null_;
    /* Reference spelling and a variant that looks the same on screen.     */
    /* NOTE(review): err_str presumably contains a Latin look-alike letter */
    /* - confirm by inspecting the bytes.                                   */
    good_str = "Иванов";
    err_str = "Ивaнов";

    /* Run every helper on the faulty string before reporting. */
    fixed_str = fix_layout_misprints(err_str);
    rus_count_in_err = count_rus_letters(err_str);
    eng_count_in_err = count_eng_letters(err_str);
    is_error_str_russian = is_string_russian(err_str);

    /* Report to the log. */
    put "Good string=" good_str;
    put "Error string=" err_str;
    put "Fixed string=" fixed_str;
    put "Count or Cyrillic symbols in error string=" rus_count_in_err;
    put "Count or English symbols in error string=" eng_count_in_err;
    put "Is error string language Russian=" is_error_str_russian;

    /* Compare the strings before and after the repair. */
    if not (good_str = err_str) then do;
        put "Before clearing - strings are not equal to each other";
    end;

    if good_str = fixed_str then do;
        put "After clearing - strings are equal to each other";
    end;
run;
Verwandte Themen