diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-12-31 17:38:02 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-12-31 17:38:02 +0000 |
commit | 2375dba898b481837879940dfdcf3ea85248fe01 (patch) | |
tree | cced5fb680e9a362d1de25630bff537865d38365 /contrib/snowball/java/org | |
parent | 1543c98d38ffb84a1e405081436d0a25bee713a6 (diff) | |
download | rspamd-2375dba898b481837879940dfdcf3ea85248fe01.tar.gz rspamd-2375dba898b481837879940dfdcf3ea85248fe01.zip |
Remove bloody submodules.
Diffstat (limited to 'contrib/snowball/java/org')
4 files changed, 547 insertions, 0 deletions
diff --git a/contrib/snowball/java/org/tartarus/snowball/Among.java b/contrib/snowball/java/org/tartarus/snowball/Among.java new file mode 100644 index 000000000..5ed37b503 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/Among.java @@ -0,0 +1,31 @@ +package org.tartarus.snowball; + +import java.lang.reflect.Method; + +public class Among { + public Among (String s, int substring_i, int result, + String methodname, SnowballProgram methodobject) { + this.s_size = s.length(); + this.s = s.toCharArray(); + this.substring_i = substring_i; + this.result = result; + this.methodobject = methodobject; + if (methodname.length() == 0) { + this.method = null; + } else { + try { + this.method = methodobject.getClass(). + getDeclaredMethod(methodname, new Class[0]); + } catch (NoSuchMethodException e) { + throw new RuntimeException(e); + } + } + } + + public final int s_size; /* search string */ + public final char[] s; /* search string */ + public final int substring_i; /* index to longest matching substring */ + public final int result; /* result of the lookup */ + public final Method method; /* method to use if substring matches */ + public final SnowballProgram methodobject; /* object to invoke method on */ +}; diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java new file mode 100644 index 000000000..52d6baa78 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/SnowballProgram.java @@ -0,0 +1,432 @@ + +package org.tartarus.snowball; +import java.lang.reflect.InvocationTargetException; + +public class SnowballProgram { + protected SnowballProgram() + { + current = new StringBuffer(); + setCurrent(""); + } + + /** + * Set the current string. + */ + public void setCurrent(String value) + { + current.replace(0, current.length(), value); + cursor = 0; + limit = current.length(); + limit_backward = 0; + bra = cursor; + ket = limit; + } + + /** + * Get the current string. + */ + public String getCurrent() + { + String result = current.toString(); + // Make a new StringBuffer. If we reuse the old one, and a user of + // the library keeps a reference to the buffer returned (for example, + // by converting it to a String in a way which doesn't force a copy), + // the buffer size will not decrease, and we will risk wasting a large + // amount of memory. + // Thanks to Wolfram Esser for spotting this problem. + current = new StringBuffer(); + return result; + } + + // current string + protected StringBuffer current; + + protected int cursor; + protected int limit; + protected int limit_backward; + protected int bra; + protected int ket; + + protected void copy_from(SnowballProgram other) + { + current = other.current; + cursor = other.cursor; + limit = other.limit; + limit_backward = other.limit_backward; + bra = other.bra; + ket = other.ket; + } + + protected boolean in_grouping(char [] s, int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (ch > max || ch < min) return false; + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; + cursor++; + return true; + } + + protected boolean in_grouping_b(char [] s, int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if (ch > max || ch < min) return false; + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false; + cursor--; + return true; + } + + protected boolean out_grouping(char [] s, int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (ch > max || ch < min) { + cursor++; + return true; + } + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { + cursor ++; + return true; + } + return false; + } + + protected boolean out_grouping_b(char [] s, int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if (ch > max || ch < min) { + cursor--; + return true; + } + ch -= min; + if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) { + cursor--; + return true; + } + return false; + } + + protected boolean in_range(int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (ch > max || ch < min) return false; + cursor++; + return true; + } + + protected boolean in_range_b(int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if (ch > max || ch < min) return false; + cursor--; + return true; + } + + protected boolean out_range(int min, int max) + { + if (cursor >= limit) return false; + char ch = current.charAt(cursor); + if (!(ch > max || ch < min)) return false; + cursor++; + return true; + } + + protected boolean out_range_b(int min, int max) + { + if (cursor <= limit_backward) return false; + char ch = current.charAt(cursor - 1); + if(!(ch > max || ch < min)) return false; + cursor--; + return true; + } + + protected boolean eq_s(int s_size, String s) + { + if (limit - cursor < s_size) return false; + int i; + for (i = 0; i != s_size; i++) { + if (current.charAt(cursor + i) != s.charAt(i)) return false; + } + cursor += s_size; + return true; + } + + protected boolean eq_s_b(int s_size, String s) + { + if (cursor - limit_backward < s_size) return false; + int i; + for (i = 0; i != s_size; i++) { + if (current.charAt(cursor - s_size + i) != s.charAt(i)) return false; + } + cursor -= s_size; + return true; + } + + protected boolean eq_v(CharSequence s) + { + return eq_s(s.length(), s.toString()); + } + + protected boolean eq_v_b(CharSequence s) + { return eq_s_b(s.length(), s.toString()); + } + + protected int find_among(Among v[], int v_size) + { + int i = 0; + int j = v_size; + + int c = cursor; + int l = limit; + + int common_i = 0; + int common_j = 0; + + boolean first_key_inspected = false; + + while(true) { + int k = i + ((j - i) >> 1); + int diff = 0; + int common = common_i < common_j ? common_i : common_j; // smaller + Among w = v[k]; + int i2; + for (i2 = common; i2 < w.s_size; i2++) { + if (c + common == l) { + diff = -1; + break; + } + diff = current.charAt(c + common) - w.s[i2]; + if (diff != 0) break; + common++; + } + if (diff < 0) { + j = k; + common_j = common; + } else { + i = k; + common_i = common; + } + if (j - i <= 1) { + if (i > 0) break; // v->s has been inspected + if (j == i) break; // only one item in v + + // - but now we need to go round once more to get + // v->s inspected. This looks messy, but is actually + // the optimal approach. + + if (first_key_inspected) break; + first_key_inspected = true; + } + } + while(true) { + Among w = v[i]; + if (common_i >= w.s_size) { + cursor = c + w.s_size; + if (w.method == null) return w.result; + boolean res; + try { + Object resobj = w.method.invoke(w.methodobject, + new Object[0]); + res = resobj.toString().equals("true"); + } catch (InvocationTargetException e) { + res = false; + // FIXME - debug message + } catch (IllegalAccessException e) { + res = false; + // FIXME - debug message + } + cursor = c + w.s_size; + if (res) return w.result; + } + i = w.substring_i; + if (i < 0) return 0; + } + } + + // find_among_b is for backwards processing. Same comments apply + protected int find_among_b(Among v[], int v_size) + { + int i = 0; + int j = v_size; + + int c = cursor; + int lb = limit_backward; + + int common_i = 0; + int common_j = 0; + + boolean first_key_inspected = false; + + while(true) { + int k = i + ((j - i) >> 1); + int diff = 0; + int common = common_i < common_j ? common_i : common_j; + Among w = v[k]; + int i2; + for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) { + if (c - common == lb) { + diff = -1; + break; + } + diff = current.charAt(c - 1 - common) - w.s[i2]; + if (diff != 0) break; + common++; + } + if (diff < 0) { + j = k; + common_j = common; + } else { + i = k; + common_i = common; + } + if (j - i <= 1) { + if (i > 0) break; + if (j == i) break; + if (first_key_inspected) break; + first_key_inspected = true; + } + } + while(true) { + Among w = v[i]; + if (common_i >= w.s_size) { + cursor = c - w.s_size; + if (w.method == null) return w.result; + + boolean res; + try { + Object resobj = w.method.invoke(w.methodobject, + new Object[0]); + res = resobj.toString().equals("true"); + } catch (InvocationTargetException e) { + res = false; + // FIXME - debug message + } catch (IllegalAccessException e) { + res = false; + // FIXME - debug message + } + cursor = c - w.s_size; + if (res) return w.result; + } + i = w.substring_i; + if (i < 0) return 0; + } + } + + /* to replace chars between c_bra and c_ket in current by the + * chars in s. + */ + protected int replace_s(int c_bra, int c_ket, String s) + { + int adjustment = s.length() - (c_ket - c_bra); + current.replace(c_bra, c_ket, s); + limit += adjustment; + if (cursor >= c_ket) cursor += adjustment; + else if (cursor > c_bra) cursor = c_bra; + return adjustment; + } + + protected void slice_check() + { + if (bra < 0 || + bra > ket || + ket > limit || + limit > current.length()) // this line could be removed + { + System.err.println("faulty slice operation"); + // FIXME: report error somehow. + /* + fprintf(stderr, "faulty slice operation:\n"); + debug(z, -1, 0); + exit(1); + */ + } + } + + protected void slice_from(String s) + { + slice_check(); + replace_s(bra, ket, s); + } + + protected void slice_from(CharSequence s) + { + slice_from(s.toString()); + } + + protected void slice_del() + { + slice_from(""); + } + + protected void insert(int c_bra, int c_ket, String s) + { + int adjustment = replace_s(c_bra, c_ket, s); + if (c_bra <= bra) bra += adjustment; + if (c_bra <= ket) ket += adjustment; + } + + protected void insert(int c_bra, int c_ket, CharSequence s) + { + insert(c_bra, c_ket, s.toString()); + } + + /* Copy the slice into the supplied StringBuffer */ + protected StringBuffer slice_to(StringBuffer s) + { + slice_check(); + int len = ket - bra; + s.replace(0, s.length(), current.substring(bra, ket)); + return s; + } + + /* Copy the slice into the supplied StringBuilder */ + protected StringBuilder slice_to(StringBuilder s) + { + slice_check(); + int len = ket - bra; + s.replace(0, s.length(), current.substring(bra, ket)); + return s; + } + + protected StringBuffer assign_to(StringBuffer s) + { + s.replace(0, s.length(), current.substring(0, limit)); + return s; + } + + protected StringBuilder assign_to(StringBuilder s) + { + s.replace(0, s.length(), current.substring(0, limit)); + return s; + } + +/* +extern void debug(struct SN_env * z, int number, int line_count) +{ int i; + int limit = SIZE(z->p); + //if (number >= 0) printf("%3d (line %4d): '", number, line_count); + if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit); + for (i = 0; i <= limit; i++) + { if (z->lb == i) printf("{"); + if (z->bra == i) printf("["); + if (z->c == i) printf("|"); + if (z->ket == i) printf("]"); + if (z->l == i) printf("}"); + if (i < limit) + { int ch = z->p[i]; + if (ch == 0) ch = '#'; + printf("%c", ch); + } + } + printf("'\n"); +} +*/ + +}; diff --git a/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java new file mode 100644 index 000000000..960bd55f6 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/SnowballStemmer.java @@ -0,0 +1,7 @@ + +package org.tartarus.snowball; +import java.lang.reflect.InvocationTargetException; + +public abstract class SnowballStemmer extends SnowballProgram { + public abstract boolean stem(); +}; diff --git a/contrib/snowball/java/org/tartarus/snowball/TestApp.java b/contrib/snowball/java/org/tartarus/snowball/TestApp.java new file mode 100644 index 000000000..38803f673 --- /dev/null +++ b/contrib/snowball/java/org/tartarus/snowball/TestApp.java @@ -0,0 +1,77 @@ + +package org.tartarus.snowball; + +import java.lang.reflect.Method; +import java.io.Reader; +import java.io.Writer; +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.OutputStream; +import java.io.FileOutputStream; + +public class TestApp { + private static void usage() + { + System.err.println("Usage: TestApp <algorithm> <input file> [-o <output file>]"); + } + + public static void main(String [] args) throws Throwable { + if (args.length < 2) { + usage(); + return; + } + + Class stemClass = Class.forName("org.tartarus.snowball.ext." + + args[0] + "Stemmer"); + SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance(); + + Reader reader; + reader = new InputStreamReader(new FileInputStream(args[1])); + reader = new BufferedReader(reader); + + StringBuffer input = new StringBuffer(); + + OutputStream outstream; + + if (args.length > 2) { + if (args.length >= 4 && args[2].equals("-o")) { + outstream = new FileOutputStream(args[3]); + } else { + usage(); + return; + } + } else { + outstream = System.out; + } + Writer output = new OutputStreamWriter(outstream); + output = new BufferedWriter(output); + + int repeat = 1; + if (args.length > 4) { + repeat = Integer.parseInt(args[4]); + } + + Object [] emptyArgs = new Object[0]; + int character; + while ((character = reader.read()) != -1) { + char ch = (char) character; + if (Character.isWhitespace((char) ch)) { + if (input.length() > 0) { + stemmer.setCurrent(input.toString()); + for (int i = repeat; i != 0; i--) { + stemmer.stem(); + } + output.write(stemmer.getCurrent()); + output.write('\n'); + input.delete(0, input.length()); + } + } else { + input.append(Character.toLowerCase(ch)); + } + } + output.flush(); + } +} |