/* * select.c - search with suffix array file * * Copyright (C) 2000, 2001, * Nara Institute of Science and Technology * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Nara Institute of * Science and Technology. * 4. The name Nara Institute of Science and Technology may not be used to * endorse or promote products derived from this software without specific * prior written permission. * * * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $Id: select.c,v 1.1.1.1 2004/05/03 15:47:28 joerg72 Exp $ */ #include #include #include #include "sufary.h" #include "tokenizer.h" #include "htobe.h" #define MINMIN(X,Y) ((X) < (Y) ? (X) : (Y)) /* * sistring の比較 */ static int cmp_sistr(char *, char *, int *, int); static eresult sa_search(SUFARY *, char *, int, int); #define sa_aryidx2txtptr(ary, idx) \ ((ary)->txtmap + betoh(((long *)((ary)->arymap + (idx) * sizeof(long)))[0])) /* * eresult sa_search(SUFARY *ary, char *s, int keylen, int base_offset); * * purpose * 検索を実行 * * parameters * ary : 検索対象ファイル * s : 検索キーワード * keylen : キーワードの長さ * * return value * プログラム継続モード * * description * array から文字列を探索。 * 2分探査を3回使って範囲検索をする * * abxxiabcdefgxyzpoop * ary->left hipdabbbbbbbbbbxaba ary->right * | bbbcfffffffffffhllm | * ------------------##########------------- * || || * l_out l_in r_in r_out * * キーワード "fb" で検索した例。 * 検索結果(範囲)は left_inside 〜 right_inside ( # で示した部分 ) * * [重要] left, right は必ず検索範囲の内側を指す。 980319 * * ・書き変わるもの * ary->ee : エラーコード(どんなエラーが発生したか) * ary->right, ary->left : 自動的に次の検索範囲を狭める * */ static eresult sa_search(SUFARY * ary, char *s, int keylen, int base_offset) { long left_outside, right_outside, left_inside, right_inside, cur, tmp; int hr; int prefix_length_L = base_offset; int prefix_length_R = base_offset; int offset = 0, diffpos /* 異なり位置 */ ; if (ary == NULL || ary->arymap == NULL) { fprintf(stderr, "specify target files first.\n"); return FAIL; } /* * 検索範囲初期設定 */ right_outside = ary->right + 1; left_outside = ary->left - 1; right_inside = ary->right; left_inside = ary->left; /* * step 1. Match する点を見つける。 */ cur = (right_outside - left_outside) / 2 + left_outside; while (1) { offset = MINMIN(prefix_length_L, prefix_length_R); hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); if (hr < 0) { /* LESS */ left_outside = cur; prefix_length_L = offset + diffpos; } else if (hr > 0) { /* ABOVE */ right_outside = cur; prefix_length_R = offset + diffpos; } else { /* MATCH ... if (hr == 0) */ left_inside = right_inside = cur; break; } tmp = (right_outside - left_outside) / 2 + left_outside; /* * left_outside は -1 の可能性あり。 ∴ tmp も -1 になることがある * 980319 */ if (cur == tmp || tmp < ary->left) return FAIL; /* 見つからなかった... */ cur = tmp; } /* * step 2. right_inside を確定する */ offset = prefix_length_R; cur = (right_outside - right_inside) / 2 + right_inside; while (1) { hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); if (hr > 0) { /* ABOVE */ right_outside = cur; offset += diffpos; } else if (hr == 0) { /* MATCH */ right_inside = cur; } else { /* LESS ... if (hr < 0) */ ary->ee = STRUCTURE; return _ERROR; } tmp = (right_outside - right_inside) / 2 + right_inside; if (cur == tmp) break; cur = tmp; } /* * step 3. left_inside を確定する */ offset = prefix_length_L; cur = left_inside - (left_inside - left_outside) / 2; /* 980319 */ if (cur < 0) cur = 0; while (1) { hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); if (hr < 0) { /* LESS */ left_outside = cur; offset += diffpos; } else if (hr == 0) { /* MATCH */ left_inside = cur; } else { /* ABOVE ... if (hr > 0) */ ary->ee = STRUCTURE; return _ERROR; } tmp = left_inside - (left_inside - left_outside) / 2; /* 980319 */ if (tmp < 0) tmp = 0; if (cur == tmp) break; cur = tmp; } /* * ary->left, ary->right の再設定 */ ary->left = left_inside; ary->right = right_inside; return CONT; } /* * int cmp_sistr(char *txt, char *str, int *diffpos, int len); * * purpose * sistring の比較 * * parameters * txt : 比較される文字列(検索対象テキスト) * str : 比較する文字列(検索キーワード) * diffpos : 何文字目まで同じだったか? 始めて異なった位置(文字目) * 書き換えて、返し値として使用。 * len : 比較する文字数 * * return value (int) * 0 : MATCH 一致 * - : LESS sistring(pos) < str ('abc...' < 'ccc') * + : ABOVE sistring(pos) > str ('abc...' > 'aaa') * * description * 980422 に大改造 * */ static int cmp_sistr(char *txt, char *str, int *diffpos, int len) { int i; for (i = 0; i < len; i++, txt++, str++) if (*txt != *str) { *diffpos = i; return ((unsigned char) *txt - (unsigned char) *str); } *diffpos = len; return 0; } /* * sa_reset(SUFARY *ary); * * purpose * SUFARY型変数aryのrightとleftを元に戻す * * parameters * ary : 対象array */ #define sa_reset(ary) \ { (ary)->left = 0; \ (ary)->right = (ary)->arraysize - 1; } #define mbclen(mb) \ ((((unsigned char)(mb) & 0x80)) ? 2 : 1) /* * char **sa_common_prefix_search(SUFARY *ary, * char *pattern, * int pattern_len, * char **result); * * purpose * Suffix Array を TRIE とみなして、形態素解析サーチを行なう * * parameters * ary : 対象array * pattern : 検索キーワード * pattern_len : pattern のバイト長 * result : 検索結果を格納するバッファ * * return value * テキストインデックス(long)の配列へのポインタ * * description * [Common Prefix Search とは...] 検索キーワード*に*Prefixマッチする * 文字列を取り出す。「検索キーワードが」ではないよ。 * 例: 辞書 a, abc, any, anybody, anymore, ... * 検索キーワード anybody * 結果 a, any, anybody */ char ** sa_common_prefix_search(SUFARY * ary, char *pattern, int pattern_len, char **result) { int cursor; int result_last = 0; long tmp; sa_reset(ary); cursor = 0; while (1) { int next = cursor + cha_tok_mblen(Cha_tokenizer, pattern + cursor, pattern_len - cursor); if (sa_search(ary, pattern, next, cursor) != CONT) break; for (tmp = ary->left; tmp <= ary->right; tmp++) { char *entry = sa_aryidx2txtptr(ary, tmp); if (entry[next] != '\0') break; result[result_last++] = entry; } cursor = next; } result[result_last] = NULL; return result; }