--- webalizer.c.orig 2008-07-02 00:23:43.000000000 +0200 +++ webalizer.c 2008-07-05 13:45:11.000000000 +0200 @@ -36,6 +36,7 @@ #include #include #include +#include /* ensure getopt */ #ifdef HAVE_GETOPT_H @@ -255,6 +256,8 @@ char pie_color3[] = "#ff00ff"; /* pie additionnal color 3 */ char pie_color4[] = "#ffc080"; /* pie additionnal color 4 */ +iconv_t cd_from_sjis, cd_from_utf8; + /*********************************************/ /* MAIN - start here */ /*********************************************/ @@ -661,6 +664,9 @@ /* get processing start time */ start_time = time(NULL); + cd_from_sjis = iconv_open("EUC-JP", "Shift_JIS"); + cd_from_utf8 = iconv_open("EUC-JP", "UTF-8"); + /*********************************************/ /* MAIN PROCESS LOOP - read through log file */ /*********************************************/ @@ -1477,6 +1483,9 @@ if (geo_fp) GeoIP_delete(geo_fp); #endif + iconv_close(cd_from_sjis); + iconv_close(cd_from_utf8); + /* Whew, all done! Exit with completion status (0) */ exit(0); } @@ -2079,6 +2088,23 @@ if (!str) return NULL; /* make sure strings valid */ + while(*cp1){ /* for apache log's escape code. */ + if(*cp1 == '\\' && *(cp1+1) == 'x' && + isxdigit(*(cp1+2)) && isxdigit(*(cp1+3))){ + *cp2 = from_hex(*(cp1+2))*16 + from_hex(*(cp1+3)); + if ((*cp2<32)||(*cp2==127)) *cp2='_'; + cp1+=4; cp2++; + + } + else if(*cp1 == '\\' && *(cp1+1) == '\\'){ + *cp2++='\\'; + cp1+=2; + } + else *cp2++ = *cp1++; + } + *cp2=*cp1; + + cp1=cp2=str; while (*cp1) { if (*cp1=='%') /* Found an escape? */ @@ -2111,6 +2137,116 @@ if (*str1==0) return 0; else return 1; } +int score_eucj(unsigned char *str) +{ + int stat=0; + int score=0; + int bad=0; + if(str==NULL) return -1; + + for(; *str!=0;str++){ + switch(stat){ + case 0: + if(*str>= 0x20 && *str <= 0x7e) score++; //ASCII + else if(*str >= 0xa1 && *str <= 0xfe) stat=1; //KANJI(1) + else if(*str == 0x8f); // HOJYO KANJI + else if(*str == 0x8e) stat=2; // KANA + else if(*str < 0x20); //CTRL + else bad=1; + break; + case 1: + if(*str >= 0xa1 && *str <= 0xfe) score += 2; //KANJI(2) + else bad=1; + stat=0; + break; + case 2: + if(*str >= 0xa1 && *str <= 0xdf); //hankaku <- 0 + else bad=1; + stat=0; + break; + } + } + if(bad != 0) score = -1; + return score; +} + +int score_sjis(unsigned char *str) +{ + int stat=0; + int score=0; + int bad=0; + if(str==NULL) return -1; + + for(; *str != 0; str++){ + switch(stat){ + case 0: + if(*str>= 0x20 && *str <= 0x7e) score++;//ASCII + else if((*str >= 0x81 && *str <= 0x9f) || + (*str >= 0xe0 && *str <= 0xfc)) stat=1; //SJIS(1) + else if(*str >= 0xa1 && *str <= 0xdf); // KANA + else if(*str < 0x20); // CTRL + else bad=1; + break; + case 1: + if((*str >= 0x40 && *str <= 0x7e) || + (*str >= 0x80 && *str <= 0xfc)) score += 2; //SJIS(2) + else bad=1; + stat=0; + break; + } + } + if(bad != 0) score = -1; + return score; +} + +int score_utf8(unsigned char *str) +{ + int stat=0; + int score=0; + int bad=0; + if(str==NULL) return -1; + + for(; *str != 0; str++){ + switch(stat){ + case 0: + if(*str>= 0x20 && *str <= 0x7e) score++; //ASCII + else if(*str >= 0xc0 && *str <= 0xdf) stat=1; //greek etc. + else if(*str >= 0xe0 && *str <= 0xef) stat=2; //KANJI etc. + else if(*str >= 0xf0 && *str <= 0xf7) stat=4; + else if(*str < 0x20); //CTRL + else bad=1; + break; + case 1: + if(*str >= 0x80 && *str <= 0xbf) score++; + else bad=1; + stat=0; + break; + case 2: + if(*str >= 0x80 && *str <= 0xbf) stat=3; //KANJI(2) + else {bad=1; stat=0;} + break; + case 3: + if(*str >= 0x80 && *str <= 0xbf) score+=3; //KANJI(3) + else bad=1; + stat=0; + break; + case 4: + case 5: + if(*str >= 0x80 && *str <= 0xbf) stat++; + else {bad=1; stat=0;} + break; + case 6: + if(*str >= 0x80 && *str <= 0xbf) score+=4; + else bad=1; + stat=0; + break; + } + } + if(bad != 0) score = -1; + return score; +} + + /*********************************************/ /* SRCH_STRING - get search strings from ref */ /*********************************************/ @@ -2122,6 +2258,10 @@ char srch[80]=""; unsigned char *cp1, *cp2, *cps; int sp_flg=0; + int sjis, eucj, utf8; + char tmpbuf2[BUFSIZE]; + size_t inlen, outlen; + unsigned char *cp3; /* Check if search engine referrer or return */ if ( (cps=(unsigned char *)isinglist(search_list,log_rec.refer))==NULL) @@ -2160,9 +2300,39 @@ cp1=cp2+strlen((char *)cp2)-1; while (cp1!=cp2) if (isspace((unsigned char)*cp1)) *cp1--='\0'; else break; + utf8=score_utf8(cp2); + sjis=score_sjis(cp2); + eucj=score_eucj(cp2); + if(utf8 >= sjis && utf8 >= eucj){ + iconv(cd_from_utf8, NULL, 0, NULL, 0); + cp3 = cp2; + inlen = strlen(cp2)+1; + cp1 = tmpbuf2; + outlen = sizeof(tmpbuf2); + if(iconv(cd_from_utf8, (char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 && + inlen == 0){ + cp2 = tmpbuf2; + } + } + else if(sjis > utf8 && sjis > eucj){ + iconv(cd_from_sjis, NULL, 0, NULL, 0); + cp3 = cp2; + inlen = strlen(cp2)+1; + cp1 = tmpbuf2; + outlen = sizeof(tmpbuf2); + if(iconv(cd_from_sjis, (char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 && + inlen == 0){ + cp2 = tmpbuf2; + } + } + /* strip invalid chars */ cp1=cp2; - while (*cp1!=0) { if ((*cp1<32)||(*cp1==127)) *cp1='_'; cp1++; } + while (*cp1!=0) { + if ((*cp1<32)||(*cp1==127)) *cp1='_'; + *cp1=tolower(*cp1); + cp1++; + } if (put_snode((char *)cp2,(u_int64_t)1,sr_htab)) {