|
0
|
1 |
# Performance note: I benchmarked this code using a set instead of
# a list for the stopwords and was surprised to find that the list
# performed /better/ than the set - maybe because it's only a small
# list.
|
|
5 |
|
|
|
6 |
# Lowercase English stopwords, one per line; .split() turns the literal
# into a plain list of words in the order written here.  Kept as a list
# (not a set) on purpose: for a list this small the list benchmarked
# faster for membership tests.
stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()
|
|
33 |
|
|
|
34 |
def strip_stopwords(sentence):
    """Remove stopwords from *sentence* and normalize its whitespace.

    The sentence is split on runs of whitespace, every word whose
    lowercased form appears in the module-level ``stopwords`` list is
    dropped, and the remaining words (original casing preserved) are
    joined back together with single spaces.

    Returns the filtered sentence as a string; an empty or
    all-stopword sentence yields an empty string.
    """
    kept = [
        word
        for word in sentence.split()
        # Compare case-insensitively, but keep the word as written.
        if word.lower() not in stopwords
    ]
    return u' '.join(kept)
|
|
42 |
|