mirror of
https://github.com/djohnlewis/stackdump
synced 2025-12-16 21:03:26 +00:00
Compare commits
40 Commits
comments-i
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ea8aefcaf7 | ||
|
|
a10a6d1e4d | ||
|
|
4616d04c56 | ||
|
|
f53efd1422 | ||
|
|
20693a8764 | ||
|
|
f20e281d3d | ||
|
|
0f16cd4bce | ||
|
|
2d27d3efe4 | ||
|
|
3f29d45964 | ||
|
|
dcc7203c97 | ||
|
|
6a7b8ea432 | ||
|
|
c020660479 | ||
|
|
7f6ed7b438 | ||
|
|
a06d2a4c55 | ||
|
|
55bec19665 | ||
|
|
a59e3b59d0 | ||
|
|
40121f2600 | ||
|
|
db026d2ccc | ||
|
|
ae6e10e6c4 | ||
|
|
f79df598d3 | ||
|
|
4d6343584a | ||
|
|
9d1d6b135a | ||
|
|
96b06f7b35 | ||
|
|
28d79ea089 | ||
|
|
ce7edf1ca0 | ||
|
|
4254f31859 | ||
|
|
c11fcfacf6 | ||
|
|
7764f088c2 | ||
|
|
a4c6c2c7ba | ||
|
|
01f9b10c27 | ||
|
|
cdb93e6f68 | ||
|
|
0990e00852 | ||
|
|
92e359174a | ||
|
|
c521fc1627 | ||
|
|
722d4125e7 | ||
|
|
ce3eb04270 | ||
|
|
9613caa8d1 | ||
|
|
2583afeb90 | ||
|
|
522e1ff4f2 | ||
|
|
36eb8d3980 |
31
.gitignore
vendored
Normal file
31
.gitignore
vendored
Normal file
@@ -0,0 +1,31 @@
|
||||
^JAVA_CMD$
|
||||
^PYTHON_CMD$
|
||||
|
||||
.DS_Store
|
||||
|
||||
# ignore any data
|
||||
data
|
||||
|
||||
# ignore working bytecode
|
||||
\.class$
|
||||
\.pyc$
|
||||
|
||||
^datadump/*
|
||||
|
||||
# ignore test and tutorial directories
|
||||
test/*$
|
||||
tests/*$
|
||||
testsuite/*$
|
||||
tutorial/*$
|
||||
|
||||
# Solr/Jetty
|
||||
^java/solr/server/solr-webapp/*
|
||||
^java/solr/server/logs/*
|
||||
|
||||
# ignore the downloaded logos
|
||||
^python/media/images/logos48
|
||||
^python/media/images/icons
|
||||
^python/media/images/badges
|
||||
|
||||
# PyCharm project files
|
||||
^.idea/
|
||||
3
.idea/.gitignore
generated
vendored
Normal file
3
.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
||||
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/stackdump.iml" filepath="$PROJECT_DIR$/.idea/stackdump.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
||||
8
.idea/stackdump.iml
generated
Normal file
8
.idea/stackdump.iml
generated
Normal file
@@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
6
.idea/vcs.xml
generated
Normal file
6
.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
||||
105
README.textile
105
README.textile
@@ -1,3 +1,8 @@
|
||||
Tutorial (Thai language) : http://www.youtube.com/watch?v=jHEBlHxEKeM
|
||||
"http://blog.sornram9254.com/stackdump-an-offline-browser-for-stackexchange-stackverflow/":http://blog.sornram9254.com/stackdump-an-offline-browser-for-stackexchange-stackverflow/
|
||||
|
||||
- - - - - - - - - - - - - - - - - - - ORIGINAL README - - - - - - - - - - - - - - - - - - -
|
||||
|
||||
h1. Stackdump - an offline browser for StackExchange sites.
|
||||
|
||||
Stackdump was conceived for those who work in environments that do not have easy access to the StackExchange family of websites. It allows you to host a read-only instance of the StackExchange sites locally, accessible via a web browser.
|
||||
@@ -12,13 +17,15 @@ h2. Screenshots
|
||||
|
||||
h2. System Requirements
|
||||
|
||||
Stackdump was written in Python and requires Python 2.5 or later (but not Python 3). It leverages Apache Solr, which requires the Java runtime (JRE), version 6 or later.
|
||||
Stackdump was written in Python and requires Python 2.5 or later (but not Python 3). There have been some reported issues with older versions of Python, so the ideal version to run is v2.7.6 (the latest 2.x version, as of writing). Stackdump also leverages Apache Solr, which requires the Java runtime (JRE), version 6 or later.
|
||||
|
||||
Besides that, there are no OS-dependent dependencies and should work on any platform that Python and Java run on (although it only comes bundled with Linux scripts at the moment). It was, however, developed and tested on CentOS 5 running Python 2.7 and JRE 6 update 27.
|
||||
|
||||
You will also need "7-zip":http://www.7-zip.org/ to extract the data dump files, but Stackdump does not use it directly so you can perform the extraction on another machine first.
|
||||
|
||||
It is recommended that Stackdump be run on a system with at least 3GB of RAM, particularly if you intend to import StackOverflow into Stackdump. Apache Solr requires a fair bit of memory during the import process. It should also have a fair bit of space available; having at least roughly the space used by the raw, extracted, data dump XML files is a good rule of thumb (note that once imported, the raw data dump XML files are not needed by Stackdump any more).
|
||||
The amount of memory required for Stackdump depends on which dataset you want to import. For most datasets, at least 3GB of RAM is preferable. If you want to import StackOverflow, you must use a 64-bit operating system and a 64-bit version of Python, and also have at least 6GB of RAM available (or swap). If you do not have enough RAM available, the import process will likely fail with a _MemoryError_ message at some point.
|
||||
|
||||
Make sure you have enough disk space too - having at least roughly the space used by the raw, extracted, data dump XML files available is a good rule of thumb (note that once imported, the raw data dump XML files are not needed by Stackdump any more).
|
||||
|
||||
Finally, Stackdump has been tested and works in the latest browsers (IE9, FF10+, Chrome, Safari). It degrades fairly gracefully in older browsers, although some will have rendering issues, e.g. IE8.
|
||||
|
||||
@@ -28,24 +35,59 @@ Version 1.1 fixes a few bugs, the major one being the inability to import the 20
|
||||
|
||||
Because changes have been made to the search schema and the search indexer has been upgraded (to Solr 4.5), all data will need to be re-indexed. Therefore there is no upgrade path; follow the instructions below to set up Stackdump again. It is recommended to install this new version in a new directory, instead of overwriting the existing one.
|
||||
|
||||
h2. Changes and upgrading from v1.1 to v1.2
|
||||
|
||||
The major change in the v1.2 release are improvements to the speed of importing data. There are some other smaller changes, including new PowerShell scripts to start and manage Stackdump on Windows as well as a few bug fixes when running on Windows. The search indexing side of things has not changed, therefore data imported using v1.1 will continue to work in v1.2. _Data from older versions however, needs to be re-indexed. See the above section on upgrading to v1.1 for more details._
|
||||
|
||||
h2. Changes and upgrading from v1.2 to v1.3
|
||||
|
||||
v1.3 is primarily bugfix release, for a fairly serious bug. It turns out Stackdump has been subtly overwriting questions as more sites are imported because it assumed post IDs were unique across all sites, when they in fact were not. This meant as more sites were imported, the previous sites started to lose questions. The fix required a change to search index, therefore *the data directory will need to be deleted and all data will need to be re-imported after installing this version*. Thanks to @yammesicka for reporting the issue.
|
||||
|
||||
Other changes include a new setting to allow disabling the link and image URL rewriting, and a change to the @import_site@ command so it doesn't bail immediately if there is a Solr connection issue - it will prompt and allow resumption after the connection issue has been resolved.
|
||||
|
||||
h3. Importing the StackOverflow data dump, September 2013
|
||||
|
||||
The StackOverflow data dump has grown significantly since I started this project back in 2011. With the improvements in v1.2, on a VM with two cores and 4GB of RAM running CentOS 5.7 on a single, standard hard drive containing spinning pieces of metal,
|
||||
|
||||
* it took *84719.565491 seconds* to import it, or *23 hours, 31 minutes and 59.565491 seconds*
|
||||
* once completed, it used up *20GB* of disk space
|
||||
* during the import, roughly *30GB* of disk space was needed
|
||||
* the import process used, at max, around *2GB* of RAM.
|
||||
|
||||
In total, the StackOverflow data dump has *15,933,529 posts* (questions and answers), *2,332,403 users* and a very large number of comments.
|
||||
|
||||
I attempted this on a similarly spec'ed Windows 7 64-bit VM as well - 23 hours later and it is still trying to process the comments. The SQLite, Python or just disk performance is very poor for some reason. Therefore, if you intend on importing StackOverflow, I would advise you to run Stackdump on Linux instead. The smaller sites all complete without a reasonable time though, and there are no perceptible issues with performance as far as I'm aware on Windows.
|
||||
|
||||
h3. Reports on importing the StackOverflow data dump, September 2014
|
||||
|
||||
Due to the growth of the dataset, the import process now requires at least 6GB of RAM. This also means you must use a 64-bit operating system and a 64-bit version of Python.
|
||||
|
||||
h2. Setting up
|
||||
|
||||
Stackdump was designed for offline environments or environments with poor internet access, therefore it is bundled with all the dependencies it requires (with the exception of Python, Java and 7-zip).
|
||||
|
||||
As long as you have:
|
||||
* "Python":http://python.org/download/,
|
||||
* "Java":http://java.com/en/download/manual.jsp,
|
||||
* "Python":http://python.org/download/, version 2.5 or later but not version 3 (tested with v2.7.6),
|
||||
* "Java":http://java.com/en/download/manual.jsp, version 6 (1.6) or later,
|
||||
* "Stackdump":https://bitbucket.org/samuel.lai/stackdump/downloads,
|
||||
* the "StackExchange Data Dump":http://www.clearbits.net/creators/146-stack-exchange-data-dump (Note: this is only available as a torrent), and
|
||||
* the "StackExchange Data Dump":https://archive.org/details/stackexchange (download the sites you wish to import - note that StackOverflow is split into 7 archive files; only Comments, Posts and Users are required but after extraction the files need to be renamed to Comments.xml, Posts.xml and Users.xml respectively), and
|
||||
* "7-zip":http://www.7-zip.org/ (needed to extract the data dump files)
|
||||
|
||||
...you should be able to get an instance up and running.
|
||||
|
||||
If you are using a 64-bit operating system, get the 64-bit version of Python.
|
||||
|
||||
To provide a better experience, Stackdump can use the RSS feed content to pre-fill some of the required details during the import process, as well as to display the site logos in the app. Stackdump comes bundled with a script that downloads and places these bits in the right places. If you're in a completely offline environment however, it may be worth running this script on a connected box first.
|
||||
|
||||
h3. Windows users
|
||||
|
||||
If you're using Windows, you will need to substitute the appropriate PowerShell equivalent command for the Stackdump scripts used below. These equivalent PowerShell scripts are in the Stackdump root directory, alongside their Unix counterparts. The names are roughly the same, with the exception of @manage.sh@, which in PowerShell has been broken up into two scripts, @List-StackdumpCommands.ps1@ and @Run-StackdumpCommand.ps1@.
|
||||
|
||||
Remember to set your PowerShell execution policy to at least @RemoteSigned@ first as these scripts are not signed. Use the @Get-ExecutionPolicy@ cmdlet to see the current policy, and @Set-ExecutionPolicy@ to set it. You will need to have administrative privileges to set it.
|
||||
|
||||
h3. Extract Stackdump
|
||||
|
||||
Stackdump was to be self-contained, so to get it up and running, simply extract the Stackdump download to an appropriate location.
|
||||
Stackdump was designed to be self-contained, so to get it up and running, simply extract the Stackdump download archive to an appropriate location.
|
||||
|
||||
h3. Verify dependencies
|
||||
|
||||
@@ -58,7 +100,7 @@ bq. If you're using Java 7 on Linux and you see an error similar to the followin
|
||||
this is because you have SELinux enabled. You will need to tell SELinux to allow Java to run by using the following command as root (amending the path as necessary) -
|
||||
@chcon -t textrel_shlib_t /opt/jre1.7.0_40/lib/i386/server/libjvm.so@
|
||||
|
||||
Then type @python -V@ and check that it is version 2.5 or later (and not Python 3).
|
||||
Then type @python -V@ and check that it is version 2.5 or later (and not Python 3). Ideally this should be v2.7.6 or later as there have been some reported issues with earlier versions.
|
||||
|
||||
If you would rather not put these versions in the PATH (e.g. you don't want to override the default version of Python in your Linux distribution), you can tell Stackdump which Java and/or Python to use explicitly by creating a file named @JAVA_CMD@ or @PYTHON_CMD@ respectively in the Stackdump root directory, and placing the path to the executable in there.
|
||||
|
||||
@@ -87,15 +129,15 @@ To start the import process, execute the following command -
|
||||
|
||||
@stackdump_dir/manage.sh import_site --base-url site_url --dump-date dump_date path_to_xml_files@
|
||||
|
||||
... where site_url is the URL of the site you're importing, e.g. __android.stackexchange.com__; dump_date is the date of the data dump you're importing, e.g. __August 2012__, and finally path_to_xml_files is the path to the XML files you just extracted. The dump_date is a text string that is shown in the app only, so it can be in any format you want.
|
||||
... where @site_url@ is the URL of the site you're importing, e.g. __android.stackexchange.com__; @dump_date@ is the date of the data dump you're importing, e.g. __August 2012__, and finally @path_to_xml_files@ is the path to the directory containing the XML files that were just extracted. The @dump_date@ is a text string that is shown in the app only, so it can be in any format you want.
|
||||
|
||||
For example, to import the August 2012 data dump of the Android StackExchange site, you would execute -
|
||||
For example, to import the August 2012 data dump of the Android StackExchange site, with the files extracted into @/tmp/android@, you would execute -
|
||||
|
||||
@stackdump_dir/manage.sh import_site --base-url android.stackexchange.com --dump-date "August 2012" /tmp/android@
|
||||
|
||||
It is normal to get messages about unknown PostTypeIds and missing comments and answers. These errors are likely due to those posts being hidden via moderation.
|
||||
|
||||
This can take anywhere between a minute to 10 hours or more depending on the site you're importing. As a rough guide, __android.stackexchange.com__ took a minute on my VM, while __stackoverflow.com__ took just over 10 hours.
|
||||
This can take anywhere between a minute to 20 hours or more depending on the site you're importing. As a rough guide, __android.stackexchange.com__ took a minute on my VM, while __stackoverflow.com__ took just under 24 hours.
|
||||
|
||||
Repeat these steps for each site you wish to import. Do not attempt to import multiple sites at the same time; it will not work and you may end up with half-imported sites.
|
||||
|
||||
@@ -109,19 +151,53 @@ To start Stackdump, execute the following command -
|
||||
|
||||
... and visit port 8080 on that machine. That's it - your own offline, read-only instance of StackExchange.
|
||||
|
||||
If you need to change the port that it runs on, modify @stackdump_dir/python/src/stackdump/settings.py@ and restart the app.
|
||||
If you need to change the port that it runs on, or modify other settings that control how Stackdump works; see the 'Optional configuration' section below for more details.
|
||||
|
||||
The aforementioned @settings.py@ file also contains some other settings that control how Stackdump works.
|
||||
Both the search indexer and the app need to be running for Stackdump to work.
|
||||
|
||||
h2. Optional configuration
|
||||
|
||||
There are a few settings for those who like to tweak. There's no need to adjust them normally though; the default settings should be fine.
|
||||
|
||||
The settings file is located in @stackdump_dir/python/src/stackdump/settings.py@. The web component will need to be restarted after changes have been made for them to take effect.
|
||||
|
||||
* *SERVER_HOST* - the network interface to run the Stackdump web app on. Use _'0.0.0.0'_ for all interfaces, or _'127.0.0.1'_ for localhost only. By default, it runs on all interfaces.
|
||||
* *SERVER_PORT* - the port to run the Stackdump web app on. The default port is _8080_.
|
||||
* *SOLR_URL* - the URL to the Solr instance. The default assumes Solr is running on the same system. Change this if Solr is running on a different system.
|
||||
* *NUM_OF_DEFAULT_COMMENTS* - the number of comments shown by default for questions and answers before the remaining comments are hidden (and shown when clicked). The default is _3_ comments.
|
||||
* *NUM_OF_RANDOM_QUESTIONS* - the number of random questions shown on the home page of Stackdump and the site pages. The default is _3_ questions.
|
||||
* *REWRITE_LINKS_AND_IMAGES* - by default, all links are rewritten to either point internally or be marked as an external link, and image URLs are rewritten to point to a placeholder image. Set this setting to _False_ to disable this behaviour.
|
||||
|
||||
h2. Running Stackdump as a service
|
||||
|
||||
Stackdump comes bundled with some init.d scripts as well which were tested on CentOS 5. These are located in the @init.d@ directory. To use these, you will need to modify them to specify the path to the Stackdump root directory and the user to run under.
|
||||
|
||||
Both the search indexer and the app need to be running for Stackdump to work.
|
||||
Another option is to use "Supervisor":http://supervisord.org/ with a simple configuration file, e.g.,
|
||||
|
||||
bc.. [program:stackdump-solr]
|
||||
command=/path/to/stackdump/start_solr.sh
|
||||
priority=900
|
||||
user=stackdump_user
|
||||
stopasgroup=true
|
||||
stdout_logfile=/path/to/stackdump/solr_stdout.log
|
||||
stderr_logfile=/path/to/stackdump/solr_stderr.log
|
||||
|
||||
[program:stackdump-web]
|
||||
command=/path/to/stackdump/start_web.sh
|
||||
user=stackdump_user
|
||||
stopasgroup=true
|
||||
stdout_logfile=/path/to/stackdump/web_stdout.log
|
||||
stderr_logfile=/path/to/stackdump/web_stderr.log
|
||||
|
||||
p. Supervisor v3.0b1 or later is required, due to the _stopasgroup_ parameter. Without this parameter, Supervisor will not be able to stop the Stackdump components properly as they're being executed from a script.
|
||||
|
||||
Yet another option for those using newer Linux distributions is to create native "systemd service definitions":http://www.freedesktop.org/software/systemd/man/systemd.service.html of type _simple_ for each of the components.
|
||||
|
||||
h2. Maintenance
|
||||
|
||||
Stackdump stores all its data in the @data@ directory under its root directory. If you want to start fresh, just stop the app and the search indexer, delete that directory and restart the app and search indexer.
|
||||
|
||||
To delete certain sites from Stackdump, use the manage_sites management command -
|
||||
To delete certain sites from Stackdump, use the @manage_sites@ management command -
|
||||
|
||||
@stackdump_dir/manage.sh manage_sites -l@ to list the sites (and their site keys) currently in the system;
|
||||
@stackdump_dir/manage.sh manage_sites -d site_key@ to delete a particular site.
|
||||
@@ -144,6 +220,7 @@ Stackdump leverages several open-source projects to do various things, including
|
||||
* "markdown":http://pypi.python.org/pypi/Markdown for rendering comments
|
||||
* "mathjax":http://www.mathjax.org/ for displaying mathematical expressions properly
|
||||
* "httplib2":http://code.google.com/p/httplib2/ as a dependency of pysolr
|
||||
* "requests":https://github.com/kennethreitz/requests/ as a dependency of pysolr
|
||||
* "Apache Solr":http://lucene.apache.org/solr/ for search functionality
|
||||
|
||||
h2. Things not supported... yet
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
Example SolrCore Instance Directory
|
||||
=============================
|
||||
|
||||
This directory is provided as an example of what an "Instance Directory"
|
||||
should look like for a SolrCore
|
||||
|
||||
It's not strictly necessary that you copy all of the files in this
|
||||
directory when setting up a new SolrCores, but it is recommended.
|
||||
|
||||
|
||||
Basic Directory Structure
|
||||
-------------------------
|
||||
|
||||
The Solr Home directory typically contains the following sub-directories...
|
||||
|
||||
conf/
|
||||
This directory is mandatory and must contain your solrconfig.xml
|
||||
and schema.xml. Any other optional configuration files would also
|
||||
be kept here.
|
||||
|
||||
data/
|
||||
This directory is the default location where Solr will keep your
|
||||
index, and is used by the replication scripts for dealing with
|
||||
snapshots. You can override this location in the
|
||||
conf/solrconfig.xml. Solr will create this directory if it does not
|
||||
already exist.
|
||||
|
||||
lib/
|
||||
This directory is optional. If it exists, Solr will load any Jars
|
||||
found in this directory and use them to resolve any "plugins"
|
||||
specified in your solrconfig.xml or schema.xml (ie: Analyzers,
|
||||
Request Handlers, etc...). Alternatively you can use the <lib>
|
||||
syntax in conf/solrconfig.xml to direct Solr to your plugins. See
|
||||
the example conf/solrconfig.xml file for details.
|
||||
@@ -1,24 +0,0 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- The content of this page will be statically included into the top-
|
||||
right box of the cores overview page. Uncomment this as an example to
|
||||
see there the content will show up.
|
||||
|
||||
<img src="img/ico/construction.png"> This line will appear at the top-
|
||||
right box on collection1's Overview
|
||||
-->
|
||||
@@ -1,25 +0,0 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- admin-extra.menu-bottom.html -->
|
||||
<!--
|
||||
<li>
|
||||
<a href="#" style="background-image: url(img/ico/construction.png);">
|
||||
LAST ITEM
|
||||
</a>
|
||||
</li>
|
||||
-->
|
||||
@@ -1,25 +0,0 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- admin-extra.menu-top.html -->
|
||||
<!--
|
||||
<li>
|
||||
<a href="#" style="background-image: url(img/ico/construction.png);">
|
||||
FIRST ITEM
|
||||
</a>
|
||||
</li>
|
||||
-->
|
||||
@@ -1,67 +0,0 @@
|
||||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Example exchange rates file for CurrencyField type named "currency" in example schema -->
|
||||
|
||||
<currencyConfig version="1.0">
|
||||
<rates>
|
||||
<!-- Updated from http://www.exchangerate.com/ at 2011-09-27 -->
|
||||
<rate from="USD" to="ARS" rate="4.333871" comment="ARGENTINA Peso" />
|
||||
<rate from="USD" to="AUD" rate="1.025768" comment="AUSTRALIA Dollar" />
|
||||
<rate from="USD" to="EUR" rate="0.743676" comment="European Euro" />
|
||||
<rate from="USD" to="BRL" rate="1.881093" comment="BRAZIL Real" />
|
||||
<rate from="USD" to="CAD" rate="1.030815" comment="CANADA Dollar" />
|
||||
<rate from="USD" to="CLP" rate="519.0996" comment="CHILE Peso" />
|
||||
<rate from="USD" to="CNY" rate="6.387310" comment="CHINA Yuan" />
|
||||
<rate from="USD" to="CZK" rate="18.47134" comment="CZECH REP. Koruna" />
|
||||
<rate from="USD" to="DKK" rate="5.515436" comment="DENMARK Krone" />
|
||||
<rate from="USD" to="HKD" rate="7.801922" comment="HONG KONG Dollar" />
|
||||
<rate from="USD" to="HUF" rate="215.6169" comment="HUNGARY Forint" />
|
||||
<rate from="USD" to="ISK" rate="118.1280" comment="ICELAND Krona" />
|
||||
<rate from="USD" to="INR" rate="49.49088" comment="INDIA Rupee" />
|
||||
<rate from="USD" to="XDR" rate="0.641358" comment="INTNL MON. FUND SDR" />
|
||||
<rate from="USD" to="ILS" rate="3.709739" comment="ISRAEL Sheqel" />
|
||||
<rate from="USD" to="JPY" rate="76.32419" comment="JAPAN Yen" />
|
||||
<rate from="USD" to="KRW" rate="1169.173" comment="KOREA (SOUTH) Won" />
|
||||
<rate from="USD" to="KWD" rate="0.275142" comment="KUWAIT Dinar" />
|
||||
<rate from="USD" to="MXN" rate="13.85895" comment="MEXICO Peso" />
|
||||
<rate from="USD" to="NZD" rate="1.285159" comment="NEW ZEALAND Dollar" />
|
||||
<rate from="USD" to="NOK" rate="5.859035" comment="NORWAY Krone" />
|
||||
<rate from="USD" to="PKR" rate="87.57007" comment="PAKISTAN Rupee" />
|
||||
<rate from="USD" to="PEN" rate="2.730683" comment="PERU Sol" />
|
||||
<rate from="USD" to="PHP" rate="43.62039" comment="PHILIPPINES Peso" />
|
||||
<rate from="USD" to="PLN" rate="3.310139" comment="POLAND Zloty" />
|
||||
<rate from="USD" to="RON" rate="3.100932" comment="ROMANIA Leu" />
|
||||
<rate from="USD" to="RUB" rate="32.14663" comment="RUSSIA Ruble" />
|
||||
<rate from="USD" to="SAR" rate="3.750465" comment="SAUDI ARABIA Riyal" />
|
||||
<rate from="USD" to="SGD" rate="1.299352" comment="SINGAPORE Dollar" />
|
||||
<rate from="USD" to="ZAR" rate="8.329761" comment="SOUTH AFRICA Rand" />
|
||||
<rate from="USD" to="SEK" rate="6.883442" comment="SWEDEN Krona" />
|
||||
<rate from="USD" to="CHF" rate="0.906035" comment="SWITZERLAND Franc" />
|
||||
<rate from="USD" to="TWD" rate="30.40283" comment="TAIWAN Dollar" />
|
||||
<rate from="USD" to="THB" rate="30.89487" comment="THAILAND Baht" />
|
||||
<rate from="USD" to="AED" rate="3.672955" comment="U.A.E. Dirham" />
|
||||
<rate from="USD" to="UAH" rate="7.988582" comment="UKRAINE Hryvnia" />
|
||||
<rate from="USD" to="GBP" rate="0.647910" comment="UNITED KINGDOM Pound" />
|
||||
|
||||
<!-- Cross-rates for some common currencies -->
|
||||
<rate from="EUR" to="GBP" rate="0.869914" />
|
||||
<rate from="EUR" to="NOK" rate="7.800095" />
|
||||
<rate from="GBP" to="NOK" rate="8.966508" />
|
||||
</rates>
|
||||
</currencyConfig>
|
||||
@@ -1,38 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- If this file is found in the config directory, it will only be
|
||||
loaded once at startup. If it is found in Solr's data
|
||||
directory, it will be re-loaded every commit.
|
||||
|
||||
See http://wiki.apache.org/solr/QueryElevationComponent for more info
|
||||
|
||||
-->
|
||||
<elevate>
|
||||
<query text="foo bar">
|
||||
<doc id="1" />
|
||||
<doc id="2" />
|
||||
<doc id="3" />
|
||||
</query>
|
||||
|
||||
<query text="ipod">
|
||||
<doc id="MA147LL/A" /> <!-- put the actual ipod at the top -->
|
||||
<doc id="IW-02" exclude="true" /> <!-- exclude this cable -->
|
||||
</query>
|
||||
|
||||
</elevate>
|
||||
@@ -1,8 +0,0 @@
|
||||
# Set of Catalan contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
d
|
||||
l
|
||||
m
|
||||
n
|
||||
s
|
||||
t
|
||||
@@ -1,15 +0,0 @@
|
||||
# Set of French contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
l
|
||||
m
|
||||
t
|
||||
qu
|
||||
n
|
||||
s
|
||||
j
|
||||
d
|
||||
c
|
||||
jusqu
|
||||
quoiqu
|
||||
lorsqu
|
||||
puisqu
|
||||
@@ -1,5 +0,0 @@
|
||||
# Set of Irish contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
d
|
||||
m
|
||||
b
|
||||
@@ -1,23 +0,0 @@
|
||||
# Set of Italian contractions for ElisionFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
c
|
||||
l
|
||||
all
|
||||
dall
|
||||
dell
|
||||
nell
|
||||
sull
|
||||
coll
|
||||
pell
|
||||
gl
|
||||
agl
|
||||
dagl
|
||||
degl
|
||||
negl
|
||||
sugl
|
||||
un
|
||||
m
|
||||
t
|
||||
s
|
||||
v
|
||||
d
|
||||
@@ -1,5 +0,0 @@
|
||||
# Set of Irish hyphenations for StopFilter
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
h
|
||||
n
|
||||
t
|
||||
@@ -1,6 +0,0 @@
|
||||
# Set of overrides for the dutch stemmer
|
||||
# TODO: load this as a resource from the analyzer and sync it in build.xml
|
||||
fiets fiets
|
||||
bromfiets bromfiets
|
||||
ei eier
|
||||
kind kinder
|
||||
@@ -1,420 +0,0 @@
|
||||
#
|
||||
# This file defines a Japanese stoptag set for JapanesePartOfSpeechStopFilter.
|
||||
#
|
||||
# Any token with a part-of-speech tag that exactly matches those defined in this
|
||||
# file are removed from the token stream.
|
||||
#
|
||||
# Set your own stoptags by uncommenting the lines below. Note that comments are
|
||||
# not allowed on the same line as a stoptag. See LUCENE-3745 for frequency lists,
|
||||
# etc. that can be useful for building you own stoptag set.
|
||||
#
|
||||
# The entire possible tagset is provided below for convenience.
|
||||
#
|
||||
#####
|
||||
# noun: unclassified nouns
|
||||
#名詞
|
||||
#
|
||||
# noun-common: Common nouns or nouns where the sub-classification is undefined
|
||||
#名詞-一般
|
||||
#
|
||||
# noun-proper: Proper nouns where the sub-classification is undefined
|
||||
#名詞-固有名詞
|
||||
#
|
||||
# noun-proper-misc: miscellaneous proper nouns
|
||||
#名詞-固有名詞-一般
|
||||
#
|
||||
# noun-proper-person: Personal names where the sub-classification is undefined
|
||||
#名詞-固有名詞-人名
|
||||
#
|
||||
# noun-proper-person-misc: names that cannot be divided into surname and
|
||||
# given name; foreign names; names where the surname or given name is unknown.
|
||||
# e.g. お市の方
|
||||
#名詞-固有名詞-人名-一般
|
||||
#
|
||||
# noun-proper-person-surname: Mainly Japanese surnames.
|
||||
# e.g. 山田
|
||||
#名詞-固有名詞-人名-姓
|
||||
#
|
||||
# noun-proper-person-given_name: Mainly Japanese given names.
|
||||
# e.g. 太郎
|
||||
#名詞-固有名詞-人名-名
|
||||
#
|
||||
# noun-proper-organization: Names representing organizations.
|
||||
# e.g. 通産省, NHK
|
||||
#名詞-固有名詞-組織
|
||||
#
|
||||
# noun-proper-place: Place names where the sub-classification is undefined
|
||||
#名詞-固有名詞-地域
|
||||
#
|
||||
# noun-proper-place-misc: Place names excluding countries.
|
||||
# e.g. アジア, バルセロナ, 京都
|
||||
#名詞-固有名詞-地域-一般
|
||||
#
|
||||
# noun-proper-place-country: Country names.
|
||||
# e.g. 日本, オーストラリア
|
||||
#名詞-固有名詞-地域-国
|
||||
#
|
||||
# noun-pronoun: Pronouns where the sub-classification is undefined
|
||||
#名詞-代名詞
|
||||
#
|
||||
# noun-pronoun-misc: miscellaneous pronouns:
|
||||
# e.g. それ, ここ, あいつ, あなた, あちこち, いくつ, どこか, なに, みなさん, みんな, わたくし, われわれ
|
||||
#名詞-代名詞-一般
|
||||
#
|
||||
# noun-pronoun-contraction: Spoken language contraction made by combining a
|
||||
# pronoun and the particle 'wa'.
|
||||
# e.g. ありゃ, こりゃ, こりゃあ, そりゃ, そりゃあ
|
||||
#名詞-代名詞-縮約
|
||||
#
|
||||
# noun-adverbial: Temporal nouns such as names of days or months that behave
|
||||
# like adverbs. Nouns that represent amount or ratios and can be used adverbially,
|
||||
# e.g. 金曜, 一月, 午後, 少量
|
||||
#名詞-副詞可能
|
||||
#
|
||||
# noun-verbal: Nouns that take arguments with case and can appear followed by
|
||||
# 'suru' and related verbs (する, できる, なさる, くださる)
|
||||
# e.g. インプット, 愛着, 悪化, 悪戦苦闘, 一安心, 下取り
|
||||
#名詞-サ変接続
|
||||
#
|
||||
# noun-adjective-base: The base form of adjectives, words that appear before な ("na")
|
||||
# e.g. 健康, 安易, 駄目, だめ
|
||||
#名詞-形容動詞語幹
|
||||
#
|
||||
# noun-numeric: Arabic numbers, Chinese numerals, and counters like 何 (回), 数.
|
||||
# e.g. 0, 1, 2, 何, 数, 幾
|
||||
#名詞-数
|
||||
#
|
||||
# noun-affix: noun affixes where the sub-classification is undefined
|
||||
#名詞-非自立
|
||||
#
|
||||
# noun-affix-misc: Of adnominalizers, the case-marker の ("no"), and words that
|
||||
# attach to the base form of inflectional words, words that cannot be classified
|
||||
# into any of the other categories below. This category includes indefinite nouns.
|
||||
# e.g. あかつき, 暁, かい, 甲斐, 気, きらい, 嫌い, くせ, 癖, こと, 事, ごと, 毎, しだい, 次第,
|
||||
# 順, せい, 所為, ついで, 序で, つもり, 積もり, 点, どころ, の, はず, 筈, はずみ, 弾み,
|
||||
# 拍子, ふう, ふり, 振り, ほう, 方, 旨, もの, 物, 者, ゆえ, 故, ゆえん, 所以, わけ, 訳,
|
||||
# わり, 割り, 割, ん-口語/, もん-口語/
|
||||
#名詞-非自立-一般
|
||||
#
|
||||
# noun-affix-adverbial: noun affixes that that can behave as adverbs.
|
||||
# e.g. あいだ, 間, あげく, 挙げ句, あと, 後, 余り, 以外, 以降, 以後, 以上, 以前, 一方, うえ,
|
||||
# 上, うち, 内, おり, 折り, かぎり, 限り, きり, っきり, 結果, ころ, 頃, さい, 際, 最中, さなか,
|
||||
# 最中, じたい, 自体, たび, 度, ため, 為, つど, 都度, とおり, 通り, とき, 時, ところ, 所,
|
||||
# とたん, 途端, なか, 中, のち, 後, ばあい, 場合, 日, ぶん, 分, ほか, 他, まえ, 前, まま,
|
||||
# 儘, 侭, みぎり, 矢先
|
||||
#名詞-非自立-副詞可能
|
||||
#
|
||||
# noun-affix-aux: noun affixes treated as 助動詞 ("auxiliary verb") in school grammars
|
||||
# with the stem よう(だ) ("you(da)").
|
||||
# e.g. よう, やう, 様 (よう)
|
||||
#名詞-非自立-助動詞語幹
|
||||
#
|
||||
# noun-affix-adjective-base: noun affixes that can connect to the indeclinable
|
||||
# connection form な (aux "da").
|
||||
# e.g. みたい, ふう
|
||||
#名詞-非自立-形容動詞語幹
|
||||
#
|
||||
# noun-special: special nouns where the sub-classification is undefined.
|
||||
#名詞-特殊
|
||||
#
|
||||
# noun-special-aux: The そうだ ("souda") stem form that is used for reporting news, is
|
||||
# treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the base
|
||||
# form of inflectional words.
|
||||
# e.g. そう
|
||||
#名詞-特殊-助動詞語幹
|
||||
#
|
||||
# noun-suffix: noun suffixes where the sub-classification is undefined.
|
||||
#名詞-接尾
|
||||
#
|
||||
# noun-suffix-misc: Of the nouns or stem forms of other parts of speech that connect
|
||||
# to ガル or タイ and can combine into compound nouns, words that cannot be classified into
|
||||
# any of the other categories below. In general, this category is more inclusive than
|
||||
# 接尾語 ("suffix") and is usually the last element in a compound noun.
|
||||
# e.g. おき, かた, 方, 甲斐 (がい), がかり, ぎみ, 気味, ぐるみ, (~した) さ, 次第, 済 (ず) み,
|
||||
# よう, (でき)っこ, 感, 観, 性, 学, 類, 面, 用
|
||||
#名詞-接尾-一般
|
||||
#
|
||||
# noun-suffix-person: Suffixes that form nouns and attach to person names more often
|
||||
# than other nouns.
|
||||
# e.g. 君, 様, 著
|
||||
#名詞-接尾-人名
|
||||
#
|
||||
# noun-suffix-place: Suffixes that form nouns and attach to place names more often
|
||||
# than other nouns.
|
||||
# e.g. 町, 市, 県
|
||||
#名詞-接尾-地域
|
||||
#
|
||||
# noun-suffix-verbal: Of the suffixes that attach to nouns and form nouns, those that
|
||||
# can appear before スル ("suru").
|
||||
# e.g. 化, 視, 分け, 入り, 落ち, 買い
|
||||
#名詞-接尾-サ変接続
|
||||
#
|
||||
# noun-suffix-aux: The stem form of そうだ (様態) that is used to indicate conditions,
|
||||
# is treated as 助動詞 ("auxiliary verb") in school grammars, and attach to the
|
||||
# conjunctive form of inflectional words.
|
||||
# e.g. そう
|
||||
#名詞-接尾-助動詞語幹
|
||||
#
|
||||
# noun-suffix-adjective-base: Suffixes that attach to other nouns or the conjunctive
|
||||
# form of inflectional words and appear before the copula だ ("da").
|
||||
# e.g. 的, げ, がち
|
||||
#名詞-接尾-形容動詞語幹
|
||||
#
|
||||
# noun-suffix-adverbial: Suffixes that attach to other nouns and can behave as adverbs.
|
||||
# e.g. 後 (ご), 以後, 以降, 以前, 前後, 中, 末, 上, 時 (じ)
|
||||
#名詞-接尾-副詞可能
|
||||
#
|
||||
# noun-suffix-classifier: Suffixes that attach to numbers and form nouns. This category
|
||||
# is more inclusive than 助数詞 ("classifier") and includes common nouns that attach
|
||||
# to numbers.
|
||||
# e.g. 個, つ, 本, 冊, パーセント, cm, kg, カ月, か国, 区画, 時間, 時半
|
||||
#名詞-接尾-助数詞
|
||||
#
|
||||
# noun-suffix-special: Special suffixes that mainly attach to inflecting words.
|
||||
# e.g. (楽し) さ, (考え) 方
|
||||
#名詞-接尾-特殊
|
||||
#
|
||||
# noun-suffix-conjunctive: Nouns that behave like conjunctions and join two words
|
||||
# together.
|
||||
# e.g. (日本) 対 (アメリカ), 対 (アメリカ), (3) 対 (5), (女優) 兼 (主婦)
|
||||
#名詞-接続詞的
|
||||
#
|
||||
# noun-verbal_aux: Nouns that attach to the conjunctive particle て ("te") and are
|
||||
# semantically verb-like.
|
||||
# e.g. ごらん, ご覧, 御覧, 頂戴
|
||||
#名詞-動詞非自立的
|
||||
#
|
||||
# noun-quotation: text that cannot be segmented into words, proverbs, Chinese poetry,
|
||||
# dialects, English, etc. Currently, the only entry for 名詞 引用文字列 ("noun quotation")
|
||||
# is いわく ("iwaku").
|
||||
#名詞-引用文字列
|
||||
#
|
||||
# noun-nai_adjective: Words that appear before the auxiliary verb ない ("nai") and
|
||||
# behave like an adjective.
|
||||
# e.g. 申し訳, 仕方, とんでも, 違い
|
||||
#名詞-ナイ形容詞語幹
|
||||
#
|
||||
#####
|
||||
# prefix: unclassified prefixes
|
||||
#接頭詞
|
||||
#
|
||||
# prefix-nominal: Prefixes that attach to nouns (including adjective stem forms)
|
||||
# excluding numerical expressions.
|
||||
# e.g. お (水), 某 (氏), 同 (社), 故 (~氏), 高 (品質), お (見事), ご (立派)
|
||||
#接頭詞-名詞接続
|
||||
#
|
||||
# prefix-verbal: Prefixes that attach to the imperative form of a verb or a verb
|
||||
# in conjunctive form followed by なる/なさる/くださる.
|
||||
# e.g. お (読みなさい), お (座り)
|
||||
#接頭詞-動詞接続
|
||||
#
|
||||
# prefix-adjectival: Prefixes that attach to adjectives.
|
||||
# e.g. お (寒いですねえ), バカ (でかい)
|
||||
#接頭詞-形容詞接続
|
||||
#
|
||||
# prefix-numerical: Prefixes that attach to numerical expressions.
|
||||
# e.g. 約, およそ, 毎時
|
||||
#接頭詞-数接続
|
||||
#
|
||||
#####
|
||||
# verb: unclassified verbs
|
||||
#動詞
|
||||
#
|
||||
# verb-main:
|
||||
#動詞-自立
|
||||
#
|
||||
# verb-auxiliary:
|
||||
#動詞-非自立
|
||||
#
|
||||
# verb-suffix:
|
||||
#動詞-接尾
|
||||
#
|
||||
#####
|
||||
# adjective: unclassified adjectives
|
||||
#形容詞
|
||||
#
|
||||
# adjective-main:
|
||||
#形容詞-自立
|
||||
#
|
||||
# adjective-auxiliary:
|
||||
#形容詞-非自立
|
||||
#
|
||||
# adjective-suffix:
|
||||
#形容詞-接尾
|
||||
#
|
||||
#####
|
||||
# adverb: unclassified adverbs
|
||||
#副詞
|
||||
#
|
||||
# adverb-misc: Words that can be segmented into one unit and where adnominal
|
||||
# modification is not possible.
|
||||
# e.g. あいかわらず, 多分
|
||||
#副詞-一般
|
||||
#
|
||||
# adverb-particle_conjunction: Adverbs that can be followed by の, は, に,
|
||||
# な, する, だ, etc.
|
||||
# e.g. こんなに, そんなに, あんなに, なにか, なんでも
|
||||
#副詞-助詞類接続
|
||||
#
|
||||
#####
|
||||
# adnominal: Words that only have noun-modifying forms.
|
||||
# e.g. この, その, あの, どの, いわゆる, なんらかの, 何らかの, いろんな, こういう, そういう, ああいう,
|
||||
# どういう, こんな, そんな, あんな, どんな, 大きな, 小さな, おかしな, ほんの, たいした,
|
||||
# 「(, も) さる (ことながら)」, 微々たる, 堂々たる, 単なる, いかなる, 我が」「同じ, 亡き
|
||||
#連体詞
|
||||
#
|
||||
#####
|
||||
# conjunction: Conjunctions that can occur independently.
|
||||
# e.g. が, けれども, そして, じゃあ, それどころか
|
||||
接続詞
|
||||
#
|
||||
#####
|
||||
# particle: unclassified particles.
|
||||
助詞
|
||||
#
|
||||
# particle-case: case particles where the subclassification is undefined.
|
||||
助詞-格助詞
|
||||
#
|
||||
# particle-case-misc: Case particles.
|
||||
# e.g. から, が, で, と, に, へ, より, を, の, にて
|
||||
助詞-格助詞-一般
|
||||
#
|
||||
# particle-case-quote: the "to" that appears after nouns, a person’s speech,
|
||||
# quotation marks, expressions of decisions from a meeting, reasons, judgements,
|
||||
# conjectures, etc.
|
||||
# e.g. ( だ) と (述べた.), ( である) と (して執行猶予...)
|
||||
助詞-格助詞-引用
|
||||
#
|
||||
# particle-case-compound: Compounds of particles and verbs that mainly behave
|
||||
# like case particles.
|
||||
# e.g. という, といった, とかいう, として, とともに, と共に, でもって, にあたって, に当たって, に当って,
|
||||
# にあたり, に当たり, に当り, に当たる, にあたる, において, に於いて,に於て, における, に於ける,
|
||||
# にかけ, にかけて, にかんし, に関し, にかんして, に関して, にかんする, に関する, に際し,
|
||||
# に際して, にしたがい, に従い, に従う, にしたがって, に従って, にたいし, に対し, にたいして,
|
||||
# に対して, にたいする, に対する, について, につき, につけ, につけて, につれ, につれて, にとって,
|
||||
# にとり, にまつわる, によって, に依って, に因って, により, に依り, に因り, による, に依る, に因る,
|
||||
# にわたって, にわたる, をもって, を以って, を通じ, を通じて, を通して, をめぐって, をめぐり, をめぐる,
|
||||
# って-口語/, ちゅう-関西弁「という」/, (何) ていう (人)-口語/, っていう-口語/, といふ, とかいふ
|
||||
助詞-格助詞-連語
|
||||
#
|
||||
# particle-conjunctive:
|
||||
# e.g. から, からには, が, けれど, けれども, けど, し, つつ, て, で, と, ところが, どころか, とも, ども,
|
||||
# ながら, なり, ので, のに, ば, ものの, や ( した), やいなや, (ころん) じゃ(いけない)-口語/,
|
||||
# (行っ) ちゃ(いけない)-口語/, (言っ) たって (しかたがない)-口語/, (それがなく)ったって (平気)-口語/
|
||||
助詞-接続助詞
|
||||
#
|
||||
# particle-dependency:
|
||||
# e.g. こそ, さえ, しか, すら, は, も, ぞ
|
||||
助詞-係助詞
|
||||
#
|
||||
# particle-adverbial:
|
||||
# e.g. がてら, かも, くらい, 位, ぐらい, しも, (学校) じゃ(これが流行っている)-口語/,
|
||||
# (それ)じゃあ (よくない)-口語/, ずつ, (私) なぞ, など, (私) なり (に), (先生) なんか (大嫌い)-口語/,
|
||||
# (私) なんぞ, (先生) なんて (大嫌い)-口語/, のみ, だけ, (私) だって-口語/, だに,
|
||||
# (彼)ったら-口語/, (お茶) でも (いかが), 等 (とう), (今後) とも, ばかり, ばっか-口語/, ばっかり-口語/,
|
||||
# ほど, 程, まで, 迄, (誰) も (が)([助詞-格助詞] および [助詞-係助詞] の前に位置する「も」)
|
||||
助詞-副助詞
|
||||
#
|
||||
# particle-interjective: particles with interjective grammatical roles.
|
||||
# e.g. (松島) や
|
||||
助詞-間投助詞
|
||||
#
|
||||
# particle-coordinate:
|
||||
# e.g. と, たり, だの, だり, とか, なり, や, やら
|
||||
助詞-並立助詞
|
||||
#
|
||||
# particle-final:
|
||||
# e.g. かい, かしら, さ, ぜ, (だ)っけ-口語/, (とまってる) で-方言/, な, ナ, なあ-口語/, ぞ, ね, ネ,
|
||||
# ねぇ-口語/, ねえ-口語/, ねん-方言/, の, のう-口語/, や, よ, ヨ, よぉ-口語/, わ, わい-口語/
|
||||
助詞-終助詞
|
||||
#
|
||||
# particle-adverbial/conjunctive/final: The particle "ka" when unknown whether it is
|
||||
# adverbial, conjunctive, or sentence final. For example:
|
||||
# (a) 「A か B か」. Ex:「(国内で運用する) か,(海外で運用する) か (.)」
|
||||
# (b) Inside an adverb phrase. Ex:「(幸いという) か (, 死者はいなかった.)」
|
||||
# 「(祈りが届いたせい) か (, 試験に合格した.)」
|
||||
# (c) 「かのように」. Ex:「(何もなかった) か (のように振る舞った.)」
|
||||
# e.g. か
|
||||
助詞-副助詞/並立助詞/終助詞
|
||||
#
|
||||
# particle-adnominalizer: The "no" that attaches to nouns and modifies
|
||||
# non-inflectional words.
|
||||
助詞-連体化
|
||||
#
|
||||
# particle-adnominalizer: The "ni" and "to" that appear following nouns and adverbs
|
||||
# that are giongo, giseigo, or gitaigo.
|
||||
# e.g. に, と
|
||||
助詞-副詞化
|
||||
#
|
||||
# particle-special: A particle that does not fit into one of the above classifications.
|
||||
# This includes particles that are used in Tanka, Haiku, and other poetry.
|
||||
# e.g. かな, けむ, ( しただろう) に, (あんた) にゃ(わからん), (俺) ん (家)
|
||||
助詞-特殊
|
||||
#
|
||||
#####
|
||||
# auxiliary-verb:
|
||||
助動詞
|
||||
#
|
||||
#####
|
||||
# interjection: Greetings and other exclamations.
|
||||
# e.g. おはよう, おはようございます, こんにちは, こんばんは, ありがとう, どうもありがとう, ありがとうございます,
|
||||
# いただきます, ごちそうさま, さよなら, さようなら, はい, いいえ, ごめん, ごめんなさい
|
||||
#感動詞
|
||||
#
|
||||
#####
|
||||
# symbol: unclassified Symbols.
|
||||
記号
|
||||
#
|
||||
# symbol-misc: A general symbol not in one of the categories below.
|
||||
# e.g. [○◎@$〒→+]
|
||||
記号-一般
|
||||
#
|
||||
# symbol-comma: Commas
|
||||
# e.g. [,、]
|
||||
記号-読点
|
||||
#
|
||||
# symbol-period: Periods and full stops.
|
||||
# e.g. [..。]
|
||||
記号-句点
|
||||
#
|
||||
# symbol-space: Full-width whitespace.
|
||||
記号-空白
|
||||
#
|
||||
# symbol-open_bracket:
|
||||
# e.g. [({‘“『【]
|
||||
記号-括弧開
|
||||
#
|
||||
# symbol-close_bracket:
|
||||
# e.g. [)}’”』」】]
|
||||
記号-括弧閉
|
||||
#
|
||||
# symbol-alphabetic:
|
||||
#記号-アルファベット
|
||||
#
|
||||
#####
|
||||
# other: unclassified other
|
||||
#その他
|
||||
#
|
||||
# other-interjection: Words that are hard to classify as noun-suffixes or
|
||||
# sentence-final particles.
|
||||
# e.g. (だ)ァ
|
||||
その他-間投
|
||||
#
|
||||
#####
|
||||
# filler: Aizuchi that occurs during a conversation or sounds inserted as filler.
|
||||
# e.g. あの, うんと, えと
|
||||
フィラー
|
||||
#
|
||||
#####
|
||||
# non-verbal: non-verbal sound.
|
||||
非言語音
|
||||
#
|
||||
#####
|
||||
# fragment:
|
||||
#語断片
|
||||
#
|
||||
#####
|
||||
# unknown: unknown part of speech.
|
||||
#未知語
|
||||
#
|
||||
##### End of file
|
||||
@@ -1,125 +0,0 @@
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# Cleaned on October 11, 2009 (not normalized, so use before normalization)
|
||||
# This means that when modifying this list, you might need to add some
|
||||
# redundant entries, for example containing forms with both أ and ا
|
||||
من
|
||||
ومن
|
||||
منها
|
||||
منه
|
||||
في
|
||||
وفي
|
||||
فيها
|
||||
فيه
|
||||
و
|
||||
ف
|
||||
ثم
|
||||
او
|
||||
أو
|
||||
ب
|
||||
بها
|
||||
به
|
||||
ا
|
||||
أ
|
||||
اى
|
||||
اي
|
||||
أي
|
||||
أى
|
||||
لا
|
||||
ولا
|
||||
الا
|
||||
ألا
|
||||
إلا
|
||||
لكن
|
||||
ما
|
||||
وما
|
||||
كما
|
||||
فما
|
||||
عن
|
||||
مع
|
||||
اذا
|
||||
إذا
|
||||
ان
|
||||
أن
|
||||
إن
|
||||
انها
|
||||
أنها
|
||||
إنها
|
||||
انه
|
||||
أنه
|
||||
إنه
|
||||
بان
|
||||
بأن
|
||||
فان
|
||||
فأن
|
||||
وان
|
||||
وأن
|
||||
وإن
|
||||
التى
|
||||
التي
|
||||
الذى
|
||||
الذي
|
||||
الذين
|
||||
الى
|
||||
الي
|
||||
إلى
|
||||
إلي
|
||||
على
|
||||
عليها
|
||||
عليه
|
||||
اما
|
||||
أما
|
||||
إما
|
||||
ايضا
|
||||
أيضا
|
||||
كل
|
||||
وكل
|
||||
لم
|
||||
ولم
|
||||
لن
|
||||
ولن
|
||||
هى
|
||||
هي
|
||||
هو
|
||||
وهى
|
||||
وهي
|
||||
وهو
|
||||
فهى
|
||||
فهي
|
||||
فهو
|
||||
انت
|
||||
أنت
|
||||
لك
|
||||
لها
|
||||
له
|
||||
هذه
|
||||
هذا
|
||||
تلك
|
||||
ذلك
|
||||
هناك
|
||||
كانت
|
||||
كان
|
||||
يكون
|
||||
تكون
|
||||
وكانت
|
||||
وكان
|
||||
غير
|
||||
بعض
|
||||
قد
|
||||
نحو
|
||||
بين
|
||||
بينما
|
||||
منذ
|
||||
ضمن
|
||||
حيث
|
||||
الان
|
||||
الآن
|
||||
خلال
|
||||
بعد
|
||||
قبل
|
||||
حتى
|
||||
عند
|
||||
عندما
|
||||
لدى
|
||||
جميع
|
||||
@@ -1,193 +0,0 @@
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
а
|
||||
аз
|
||||
ако
|
||||
ала
|
||||
бе
|
||||
без
|
||||
беше
|
||||
би
|
||||
бил
|
||||
била
|
||||
били
|
||||
било
|
||||
близо
|
||||
бъдат
|
||||
бъде
|
||||
бяха
|
||||
в
|
||||
вас
|
||||
ваш
|
||||
ваша
|
||||
вероятно
|
||||
вече
|
||||
взема
|
||||
ви
|
||||
вие
|
||||
винаги
|
||||
все
|
||||
всеки
|
||||
всички
|
||||
всичко
|
||||
всяка
|
||||
във
|
||||
въпреки
|
||||
върху
|
||||
г
|
||||
ги
|
||||
главно
|
||||
го
|
||||
д
|
||||
да
|
||||
дали
|
||||
до
|
||||
докато
|
||||
докога
|
||||
дори
|
||||
досега
|
||||
доста
|
||||
е
|
||||
едва
|
||||
един
|
||||
ето
|
||||
за
|
||||
зад
|
||||
заедно
|
||||
заради
|
||||
засега
|
||||
затова
|
||||
защо
|
||||
защото
|
||||
и
|
||||
из
|
||||
или
|
||||
им
|
||||
има
|
||||
имат
|
||||
иска
|
||||
й
|
||||
каза
|
||||
как
|
||||
каква
|
||||
какво
|
||||
както
|
||||
какъв
|
||||
като
|
||||
кога
|
||||
когато
|
||||
което
|
||||
които
|
||||
кой
|
||||
който
|
||||
колко
|
||||
която
|
||||
къде
|
||||
където
|
||||
към
|
||||
ли
|
||||
м
|
||||
ме
|
||||
между
|
||||
мен
|
||||
ми
|
||||
мнозина
|
||||
мога
|
||||
могат
|
||||
може
|
||||
моля
|
||||
момента
|
||||
му
|
||||
н
|
||||
на
|
||||
над
|
||||
назад
|
||||
най
|
||||
направи
|
||||
напред
|
||||
например
|
||||
нас
|
||||
не
|
||||
него
|
||||
нея
|
||||
ни
|
||||
ние
|
||||
никой
|
||||
нито
|
||||
но
|
||||
някои
|
||||
някой
|
||||
няма
|
||||
обаче
|
||||
около
|
||||
освен
|
||||
особено
|
||||
от
|
||||
отгоре
|
||||
отново
|
||||
още
|
||||
пак
|
||||
по
|
||||
повече
|
||||
повечето
|
||||
под
|
||||
поне
|
||||
поради
|
||||
после
|
||||
почти
|
||||
прави
|
||||
пред
|
||||
преди
|
||||
през
|
||||
при
|
||||
пък
|
||||
първо
|
||||
с
|
||||
са
|
||||
само
|
||||
се
|
||||
сега
|
||||
си
|
||||
скоро
|
||||
след
|
||||
сме
|
||||
според
|
||||
сред
|
||||
срещу
|
||||
сте
|
||||
съм
|
||||
със
|
||||
също
|
||||
т
|
||||
тази
|
||||
така
|
||||
такива
|
||||
такъв
|
||||
там
|
||||
твой
|
||||
те
|
||||
тези
|
||||
ти
|
||||
тн
|
||||
то
|
||||
това
|
||||
тогава
|
||||
този
|
||||
той
|
||||
толкова
|
||||
точно
|
||||
трябва
|
||||
тук
|
||||
тъй
|
||||
тя
|
||||
тях
|
||||
у
|
||||
харесва
|
||||
ч
|
||||
че
|
||||
често
|
||||
чрез
|
||||
ще
|
||||
щом
|
||||
я
|
||||
@@ -1,220 +0,0 @@
|
||||
# Catalan stopwords from http://github.com/vcl/cue.language (Apache 2 Licensed)
|
||||
a
|
||||
abans
|
||||
ací
|
||||
ah
|
||||
així
|
||||
això
|
||||
al
|
||||
als
|
||||
aleshores
|
||||
algun
|
||||
alguna
|
||||
algunes
|
||||
alguns
|
||||
alhora
|
||||
allà
|
||||
allí
|
||||
allò
|
||||
altra
|
||||
altre
|
||||
altres
|
||||
amb
|
||||
ambdós
|
||||
ambdues
|
||||
apa
|
||||
aquell
|
||||
aquella
|
||||
aquelles
|
||||
aquells
|
||||
aquest
|
||||
aquesta
|
||||
aquestes
|
||||
aquests
|
||||
aquí
|
||||
baix
|
||||
cada
|
||||
cadascú
|
||||
cadascuna
|
||||
cadascunes
|
||||
cadascuns
|
||||
com
|
||||
contra
|
||||
d'un
|
||||
d'una
|
||||
d'unes
|
||||
d'uns
|
||||
dalt
|
||||
de
|
||||
del
|
||||
dels
|
||||
des
|
||||
després
|
||||
dins
|
||||
dintre
|
||||
donat
|
||||
doncs
|
||||
durant
|
||||
e
|
||||
eh
|
||||
el
|
||||
els
|
||||
em
|
||||
en
|
||||
encara
|
||||
ens
|
||||
entre
|
||||
érem
|
||||
eren
|
||||
éreu
|
||||
es
|
||||
és
|
||||
esta
|
||||
està
|
||||
estàvem
|
||||
estaven
|
||||
estàveu
|
||||
esteu
|
||||
et
|
||||
etc
|
||||
ets
|
||||
fins
|
||||
fora
|
||||
gairebé
|
||||
ha
|
||||
han
|
||||
has
|
||||
havia
|
||||
he
|
||||
hem
|
||||
heu
|
||||
hi
|
||||
ho
|
||||
i
|
||||
igual
|
||||
iguals
|
||||
ja
|
||||
l'hi
|
||||
la
|
||||
les
|
||||
li
|
||||
li'n
|
||||
llavors
|
||||
m'he
|
||||
ma
|
||||
mal
|
||||
malgrat
|
||||
mateix
|
||||
mateixa
|
||||
mateixes
|
||||
mateixos
|
||||
me
|
||||
mentre
|
||||
més
|
||||
meu
|
||||
meus
|
||||
meva
|
||||
meves
|
||||
molt
|
||||
molta
|
||||
moltes
|
||||
molts
|
||||
mon
|
||||
mons
|
||||
n'he
|
||||
n'hi
|
||||
ne
|
||||
ni
|
||||
no
|
||||
nogensmenys
|
||||
només
|
||||
nosaltres
|
||||
nostra
|
||||
nostre
|
||||
nostres
|
||||
o
|
||||
oh
|
||||
oi
|
||||
on
|
||||
pas
|
||||
pel
|
||||
pels
|
||||
per
|
||||
però
|
||||
perquè
|
||||
poc
|
||||
poca
|
||||
pocs
|
||||
poques
|
||||
potser
|
||||
propi
|
||||
qual
|
||||
quals
|
||||
quan
|
||||
quant
|
||||
que
|
||||
què
|
||||
quelcom
|
||||
qui
|
||||
quin
|
||||
quina
|
||||
quines
|
||||
quins
|
||||
s'ha
|
||||
s'han
|
||||
sa
|
||||
semblant
|
||||
semblants
|
||||
ses
|
||||
seu
|
||||
seus
|
||||
seva
|
||||
seva
|
||||
seves
|
||||
si
|
||||
sobre
|
||||
sobretot
|
||||
sóc
|
||||
solament
|
||||
sols
|
||||
son
|
||||
són
|
||||
sons
|
||||
sota
|
||||
sou
|
||||
t'ha
|
||||
t'han
|
||||
t'he
|
||||
ta
|
||||
tal
|
||||
també
|
||||
tampoc
|
||||
tan
|
||||
tant
|
||||
tanta
|
||||
tantes
|
||||
teu
|
||||
teus
|
||||
teva
|
||||
teves
|
||||
ton
|
||||
tons
|
||||
tot
|
||||
tota
|
||||
totes
|
||||
tots
|
||||
un
|
||||
una
|
||||
unes
|
||||
uns
|
||||
us
|
||||
va
|
||||
vaig
|
||||
vam
|
||||
van
|
||||
vas
|
||||
veu
|
||||
vosaltres
|
||||
vostra
|
||||
vostre
|
||||
vostres
|
||||
@@ -1,172 +0,0 @@
|
||||
a
|
||||
s
|
||||
k
|
||||
o
|
||||
i
|
||||
u
|
||||
v
|
||||
z
|
||||
dnes
|
||||
cz
|
||||
tímto
|
||||
budeš
|
||||
budem
|
||||
byli
|
||||
jseš
|
||||
můj
|
||||
svým
|
||||
ta
|
||||
tomto
|
||||
tohle
|
||||
tuto
|
||||
tyto
|
||||
jej
|
||||
zda
|
||||
proč
|
||||
máte
|
||||
tato
|
||||
kam
|
||||
tohoto
|
||||
kdo
|
||||
kteří
|
||||
mi
|
||||
nám
|
||||
tom
|
||||
tomuto
|
||||
mít
|
||||
nic
|
||||
proto
|
||||
kterou
|
||||
byla
|
||||
toho
|
||||
protože
|
||||
asi
|
||||
ho
|
||||
naši
|
||||
napište
|
||||
re
|
||||
což
|
||||
tím
|
||||
takže
|
||||
svých
|
||||
její
|
||||
svými
|
||||
jste
|
||||
aj
|
||||
tu
|
||||
tedy
|
||||
teto
|
||||
bylo
|
||||
kde
|
||||
ke
|
||||
pravé
|
||||
ji
|
||||
nad
|
||||
nejsou
|
||||
či
|
||||
pod
|
||||
téma
|
||||
mezi
|
||||
přes
|
||||
ty
|
||||
pak
|
||||
vám
|
||||
ani
|
||||
když
|
||||
však
|
||||
neg
|
||||
jsem
|
||||
tento
|
||||
článku
|
||||
články
|
||||
aby
|
||||
jsme
|
||||
před
|
||||
pta
|
||||
jejich
|
||||
byl
|
||||
ještě
|
||||
až
|
||||
bez
|
||||
také
|
||||
pouze
|
||||
první
|
||||
vaše
|
||||
která
|
||||
nás
|
||||
nový
|
||||
tipy
|
||||
pokud
|
||||
může
|
||||
strana
|
||||
jeho
|
||||
své
|
||||
jiné
|
||||
zprávy
|
||||
nové
|
||||
není
|
||||
vás
|
||||
jen
|
||||
podle
|
||||
zde
|
||||
už
|
||||
být
|
||||
více
|
||||
bude
|
||||
již
|
||||
než
|
||||
který
|
||||
by
|
||||
které
|
||||
co
|
||||
nebo
|
||||
ten
|
||||
tak
|
||||
má
|
||||
při
|
||||
od
|
||||
po
|
||||
jsou
|
||||
jak
|
||||
další
|
||||
ale
|
||||
si
|
||||
se
|
||||
ve
|
||||
to
|
||||
jako
|
||||
za
|
||||
zpět
|
||||
ze
|
||||
do
|
||||
pro
|
||||
je
|
||||
na
|
||||
atd
|
||||
atp
|
||||
jakmile
|
||||
přičemž
|
||||
já
|
||||
on
|
||||
ona
|
||||
ono
|
||||
oni
|
||||
ony
|
||||
my
|
||||
vy
|
||||
jí
|
||||
ji
|
||||
mě
|
||||
mne
|
||||
jemu
|
||||
tomu
|
||||
těm
|
||||
těmu
|
||||
němu
|
||||
němuž
|
||||
jehož
|
||||
jíž
|
||||
jelikož
|
||||
jež
|
||||
jakož
|
||||
načež
|
||||
@@ -1,108 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Danish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
|
||||
og | and
|
||||
i | in
|
||||
jeg | I
|
||||
det | that (dem. pronoun)/it (pers. pronoun)
|
||||
at | that (in front of a sentence)/to (with infinitive)
|
||||
en | a/an
|
||||
den | it (pers. pronoun)/that (dem. pronoun)
|
||||
til | to/at/for/until/against/by/of/into, more
|
||||
er | present tense of "to be"
|
||||
som | who, as
|
||||
på | on/upon/in/on/at/to/after/of/with/for, on
|
||||
de | they
|
||||
med | with/by/in, along
|
||||
han | he
|
||||
af | of/by/from/off/for/in/with/on, off
|
||||
for | at/for/to/from/by/of/ago, in front/before, because
|
||||
ikke | not
|
||||
der | who/which, there/those
|
||||
var | past tense of "to be"
|
||||
mig | me/myself
|
||||
sig | oneself/himself/herself/itself/themselves
|
||||
men | but
|
||||
et | a/an/one, one (number), someone/somebody/one
|
||||
har | present tense of "to have"
|
||||
om | round/about/for/in/a, about/around/down, if
|
||||
vi | we
|
||||
min | my
|
||||
havde | past tense of "to have"
|
||||
ham | him
|
||||
hun | she
|
||||
nu | now
|
||||
over | over/above/across/by/beyond/past/on/about, over/past
|
||||
da | then, when/as/since
|
||||
fra | from/off/since, off, since
|
||||
du | you
|
||||
ud | out
|
||||
sin | his/her/its/one's
|
||||
dem | them
|
||||
os | us/ourselves
|
||||
op | up
|
||||
man | you/one
|
||||
hans | his
|
||||
hvor | where
|
||||
eller | or
|
||||
hvad | what
|
||||
skal | must/shall etc.
|
||||
selv | myself/youself/herself/ourselves etc., even
|
||||
her | here
|
||||
alle | all/everyone/everybody etc.
|
||||
vil | will (verb)
|
||||
blev | past tense of "to stay/to remain/to get/to become"
|
||||
kunne | could
|
||||
ind | in
|
||||
når | when
|
||||
være | present tense of "to be"
|
||||
dog | however/yet/after all
|
||||
noget | something
|
||||
ville | would
|
||||
jo | you know/you see (adv), yes
|
||||
deres | their/theirs
|
||||
efter | after/behind/according to/for/by/from, later/afterwards
|
||||
ned | down
|
||||
skulle | should
|
||||
denne | this
|
||||
end | than
|
||||
dette | this
|
||||
mit | my/mine
|
||||
også | also
|
||||
under | under/beneath/below/during, below/underneath
|
||||
have | have
|
||||
dig | you
|
||||
anden | other
|
||||
hende | her
|
||||
mine | my
|
||||
alt | everything
|
||||
meget | much/very, plenty of
|
||||
sit | his, her, its, one's
|
||||
sine | his, her, its, one's
|
||||
vor | our
|
||||
mod | against
|
||||
disse | these
|
||||
hvis | if
|
||||
din | your/yours
|
||||
nogle | some
|
||||
hos | by/at
|
||||
blive | be/become
|
||||
mange | many
|
||||
ad | by/through
|
||||
bliver | present tense of "to be/to become"
|
||||
hendes | her/hers
|
||||
været | be
|
||||
thi | for (conj)
|
||||
jer | you
|
||||
sådan | such, like this/like that
|
||||
@@ -1,292 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A German stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| The number of forms in this list is reduced significantly by passing it
|
||||
| through the German stemmer.
|
||||
|
||||
|
||||
aber | but
|
||||
|
||||
alle | all
|
||||
allem
|
||||
allen
|
||||
aller
|
||||
alles
|
||||
|
||||
als | than, as
|
||||
also | so
|
||||
am | an + dem
|
||||
an | at
|
||||
|
||||
ander | other
|
||||
andere
|
||||
anderem
|
||||
anderen
|
||||
anderer
|
||||
anderes
|
||||
anderm
|
||||
andern
|
||||
anderr
|
||||
anders
|
||||
|
||||
auch | also
|
||||
auf | on
|
||||
aus | out of
|
||||
bei | by
|
||||
bin | am
|
||||
bis | until
|
||||
bist | art
|
||||
da | there
|
||||
damit | with it
|
||||
dann | then
|
||||
|
||||
der | the
|
||||
den
|
||||
des
|
||||
dem
|
||||
die
|
||||
das
|
||||
|
||||
daß | that
|
||||
|
||||
derselbe | the same
|
||||
derselben
|
||||
denselben
|
||||
desselben
|
||||
demselben
|
||||
dieselbe
|
||||
dieselben
|
||||
dasselbe
|
||||
|
||||
dazu | to that
|
||||
|
||||
dein | thy
|
||||
deine
|
||||
deinem
|
||||
deinen
|
||||
deiner
|
||||
deines
|
||||
|
||||
denn | because
|
||||
|
||||
derer | of those
|
||||
dessen | of him
|
||||
|
||||
dich | thee
|
||||
dir | to thee
|
||||
du | thou
|
||||
|
||||
dies | this
|
||||
diese
|
||||
diesem
|
||||
diesen
|
||||
dieser
|
||||
dieses
|
||||
|
||||
|
||||
doch | (several meanings)
|
||||
dort | (over) there
|
||||
|
||||
|
||||
durch | through
|
||||
|
||||
ein | a
|
||||
eine
|
||||
einem
|
||||
einen
|
||||
einer
|
||||
eines
|
||||
|
||||
einig | some
|
||||
einige
|
||||
einigem
|
||||
einigen
|
||||
einiger
|
||||
einiges
|
||||
|
||||
einmal | once
|
||||
|
||||
er | he
|
||||
ihn | him
|
||||
ihm | to him
|
||||
|
||||
es | it
|
||||
etwas | something
|
||||
|
||||
euer | your
|
||||
eure
|
||||
eurem
|
||||
euren
|
||||
eurer
|
||||
eures
|
||||
|
||||
für | for
|
||||
gegen | towards
|
||||
gewesen | p.p. of sein
|
||||
hab | have
|
||||
habe | have
|
||||
haben | have
|
||||
hat | has
|
||||
hatte | had
|
||||
hatten | had
|
||||
hier | here
|
||||
hin | there
|
||||
hinter | behind
|
||||
|
||||
ich | I
|
||||
mich | me
|
||||
mir | to me
|
||||
|
||||
|
||||
ihr | you, to her
|
||||
ihre
|
||||
ihrem
|
||||
ihren
|
||||
ihrer
|
||||
ihres
|
||||
euch | to you
|
||||
|
||||
im | in + dem
|
||||
in | in
|
||||
indem | while
|
||||
ins | in + das
|
||||
ist | is
|
||||
|
||||
jede | each, every
|
||||
jedem
|
||||
jeden
|
||||
jeder
|
||||
jedes
|
||||
|
||||
jene | that
|
||||
jenem
|
||||
jenen
|
||||
jener
|
||||
jenes
|
||||
|
||||
jetzt | now
|
||||
kann | can
|
||||
|
||||
kein | no
|
||||
keine
|
||||
keinem
|
||||
keinen
|
||||
keiner
|
||||
keines
|
||||
|
||||
können | can
|
||||
könnte | could
|
||||
machen | do
|
||||
man | one
|
||||
|
||||
manche | some, many a
|
||||
manchem
|
||||
manchen
|
||||
mancher
|
||||
manches
|
||||
|
||||
mein | my
|
||||
meine
|
||||
meinem
|
||||
meinen
|
||||
meiner
|
||||
meines
|
||||
|
||||
mit | with
|
||||
muss | must
|
||||
musste | had to
|
||||
nach | to(wards)
|
||||
nicht | not
|
||||
nichts | nothing
|
||||
noch | still, yet
|
||||
nun | now
|
||||
nur | only
|
||||
ob | whether
|
||||
oder | or
|
||||
ohne | without
|
||||
sehr | very
|
||||
|
||||
sein | his
|
||||
seine
|
||||
seinem
|
||||
seinen
|
||||
seiner
|
||||
seines
|
||||
|
||||
selbst | self
|
||||
sich | herself
|
||||
|
||||
sie | they, she
|
||||
ihnen | to them
|
||||
|
||||
sind | are
|
||||
so | so
|
||||
|
||||
solche | such
|
||||
solchem
|
||||
solchen
|
||||
solcher
|
||||
solches
|
||||
|
||||
soll | shall
|
||||
sollte | should
|
||||
sondern | but
|
||||
sonst | else
|
||||
über | over
|
||||
um | about, around
|
||||
und | and
|
||||
|
||||
uns | us
|
||||
unse
|
||||
unsem
|
||||
unsen
|
||||
unser
|
||||
unses
|
||||
|
||||
unter | under
|
||||
viel | much
|
||||
vom | von + dem
|
||||
von | from
|
||||
vor | before
|
||||
während | while
|
||||
war | was
|
||||
waren | were
|
||||
warst | wast
|
||||
was | what
|
||||
weg | away, off
|
||||
weil | because
|
||||
weiter | further
|
||||
|
||||
welche | which
|
||||
welchem
|
||||
welchen
|
||||
welcher
|
||||
welches
|
||||
|
||||
wenn | when
|
||||
werde | will
|
||||
werden | will
|
||||
wie | how
|
||||
wieder | again
|
||||
will | want
|
||||
wir | we
|
||||
wird | will
|
||||
wirst | willst
|
||||
wo | where
|
||||
wollen | want
|
||||
wollte | wanted
|
||||
würde | would
|
||||
würden | would
|
||||
zu | to
|
||||
zum | zu + dem
|
||||
zur | zu + der
|
||||
zwar | indeed
|
||||
zwischen | between
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
# Lucene Greek Stopwords list
|
||||
# Note: by default this file is used after GreekLowerCaseFilter,
|
||||
# so when modifying this file use 'σ' instead of 'ς'
|
||||
ο
|
||||
η
|
||||
το
|
||||
οι
|
||||
τα
|
||||
του
|
||||
τησ
|
||||
των
|
||||
τον
|
||||
την
|
||||
και
|
||||
κι
|
||||
κ
|
||||
ειμαι
|
||||
εισαι
|
||||
ειναι
|
||||
ειμαστε
|
||||
ειστε
|
||||
στο
|
||||
στον
|
||||
στη
|
||||
στην
|
||||
μα
|
||||
αλλα
|
||||
απο
|
||||
για
|
||||
προσ
|
||||
με
|
||||
σε
|
||||
ωσ
|
||||
παρα
|
||||
αντι
|
||||
κατα
|
||||
μετα
|
||||
θα
|
||||
να
|
||||
δε
|
||||
δεν
|
||||
μη
|
||||
μην
|
||||
επι
|
||||
ενω
|
||||
εαν
|
||||
αν
|
||||
τοτε
|
||||
που
|
||||
πωσ
|
||||
ποιοσ
|
||||
ποια
|
||||
ποιο
|
||||
ποιοι
|
||||
ποιεσ
|
||||
ποιων
|
||||
ποιουσ
|
||||
αυτοσ
|
||||
αυτη
|
||||
αυτο
|
||||
αυτοι
|
||||
αυτων
|
||||
αυτουσ
|
||||
αυτεσ
|
||||
αυτα
|
||||
εκεινοσ
|
||||
εκεινη
|
||||
εκεινο
|
||||
εκεινοι
|
||||
εκεινεσ
|
||||
εκεινα
|
||||
εκεινων
|
||||
εκεινουσ
|
||||
οπωσ
|
||||
ομωσ
|
||||
ισωσ
|
||||
οσο
|
||||
οτι
|
||||
@@ -1,54 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# a couple of test stopwords to test that the words are really being
|
||||
# configured from this file:
|
||||
stopworda
|
||||
stopwordb
|
||||
|
||||
# Standard english stop words taken from Lucene's StopAnalyzer
|
||||
a
|
||||
an
|
||||
and
|
||||
are
|
||||
as
|
||||
at
|
||||
be
|
||||
but
|
||||
by
|
||||
for
|
||||
if
|
||||
in
|
||||
into
|
||||
is
|
||||
it
|
||||
no
|
||||
not
|
||||
of
|
||||
on
|
||||
or
|
||||
such
|
||||
that
|
||||
the
|
||||
their
|
||||
then
|
||||
there
|
||||
these
|
||||
they
|
||||
this
|
||||
to
|
||||
was
|
||||
will
|
||||
with
|
||||
@@ -1,354 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Spanish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords
|
||||
| deriving from a large sample of text.
|
||||
|
||||
| Extra words have been added at the end.
|
||||
|
||||
de | from, of
|
||||
la | the, her
|
||||
que | who, that
|
||||
el | the
|
||||
en | in
|
||||
y | and
|
||||
a | to
|
||||
los | the, them
|
||||
del | de + el
|
||||
se | himself, from him etc
|
||||
las | the, them
|
||||
por | for, by, etc
|
||||
un | a
|
||||
para | for
|
||||
con | with
|
||||
no | no
|
||||
una | a
|
||||
su | his, her
|
||||
al | a + el
|
||||
| es from SER
|
||||
lo | him
|
||||
como | how
|
||||
más | more
|
||||
pero | pero
|
||||
sus | su plural
|
||||
le | to him, her
|
||||
ya | already
|
||||
o | or
|
||||
| fue from SER
|
||||
este | this
|
||||
| ha from HABER
|
||||
sí | himself etc
|
||||
porque | because
|
||||
esta | this
|
||||
| son from SER
|
||||
entre | between
|
||||
| está from ESTAR
|
||||
cuando | when
|
||||
muy | very
|
||||
sin | without
|
||||
sobre | on
|
||||
| ser from SER
|
||||
| tiene from TENER
|
||||
también | also
|
||||
me | me
|
||||
hasta | until
|
||||
hay | there is/are
|
||||
donde | where
|
||||
| han from HABER
|
||||
quien | whom, that
|
||||
| están from ESTAR
|
||||
| estado from ESTAR
|
||||
desde | from
|
||||
todo | all
|
||||
nos | us
|
||||
durante | during
|
||||
| estados from ESTAR
|
||||
todos | all
|
||||
uno | a
|
||||
les | to them
|
||||
ni | nor
|
||||
contra | against
|
||||
otros | other
|
||||
| fueron from SER
|
||||
ese | that
|
||||
eso | that
|
||||
| había from HABER
|
||||
ante | before
|
||||
ellos | they
|
||||
e | and (variant of y)
|
||||
esto | this
|
||||
mí | me
|
||||
antes | before
|
||||
algunos | some
|
||||
qué | what?
|
||||
unos | a
|
||||
yo | I
|
||||
otro | other
|
||||
otras | other
|
||||
otra | other
|
||||
él | he
|
||||
tanto | so much, many
|
||||
esa | that
|
||||
estos | these
|
||||
mucho | much, many
|
||||
quienes | who
|
||||
nada | nothing
|
||||
muchos | many
|
||||
cual | who
|
||||
| sea from SER
|
||||
poco | few
|
||||
ella | she
|
||||
estar | to be
|
||||
| haber from HABER
|
||||
estas | these
|
||||
| estaba from ESTAR
|
||||
| estamos from ESTAR
|
||||
algunas | some
|
||||
algo | something
|
||||
nosotros | we
|
||||
|
||||
| other forms
|
||||
|
||||
mi | me
|
||||
mis | mi plural
|
||||
tú | thou
|
||||
te | thee
|
||||
ti | thee
|
||||
tu | thy
|
||||
tus | tu plural
|
||||
ellas | they
|
||||
nosotras | we
|
||||
vosotros | you
|
||||
vosotras | you
|
||||
os | you
|
||||
mío | mine
|
||||
mía |
|
||||
míos |
|
||||
mías |
|
||||
tuyo | thine
|
||||
tuya |
|
||||
tuyos |
|
||||
tuyas |
|
||||
suyo | his, hers, theirs
|
||||
suya |
|
||||
suyos |
|
||||
suyas |
|
||||
nuestro | ours
|
||||
nuestra |
|
||||
nuestros |
|
||||
nuestras |
|
||||
vuestro | yours
|
||||
vuestra |
|
||||
vuestros |
|
||||
vuestras |
|
||||
esos | those
|
||||
esas | those
|
||||
|
||||
| forms of estar, to be (not including the infinitive):
|
||||
estoy
|
||||
estás
|
||||
está
|
||||
estamos
|
||||
estáis
|
||||
están
|
||||
esté
|
||||
estés
|
||||
estemos
|
||||
estéis
|
||||
estén
|
||||
estaré
|
||||
estarás
|
||||
estará
|
||||
estaremos
|
||||
estaréis
|
||||
estarán
|
||||
estaría
|
||||
estarías
|
||||
estaríamos
|
||||
estaríais
|
||||
estarían
|
||||
estaba
|
||||
estabas
|
||||
estábamos
|
||||
estabais
|
||||
estaban
|
||||
estuve
|
||||
estuviste
|
||||
estuvo
|
||||
estuvimos
|
||||
estuvisteis
|
||||
estuvieron
|
||||
estuviera
|
||||
estuvieras
|
||||
estuviéramos
|
||||
estuvierais
|
||||
estuvieran
|
||||
estuviese
|
||||
estuvieses
|
||||
estuviésemos
|
||||
estuvieseis
|
||||
estuviesen
|
||||
estando
|
||||
estado
|
||||
estada
|
||||
estados
|
||||
estadas
|
||||
estad
|
||||
|
||||
| forms of haber, to have (not including the infinitive):
|
||||
he
|
||||
has
|
||||
ha
|
||||
hemos
|
||||
habéis
|
||||
han
|
||||
haya
|
||||
hayas
|
||||
hayamos
|
||||
hayáis
|
||||
hayan
|
||||
habré
|
||||
habrás
|
||||
habrá
|
||||
habremos
|
||||
habréis
|
||||
habrán
|
||||
habría
|
||||
habrías
|
||||
habríamos
|
||||
habríais
|
||||
habrían
|
||||
había
|
||||
habías
|
||||
habíamos
|
||||
habíais
|
||||
habían
|
||||
hube
|
||||
hubiste
|
||||
hubo
|
||||
hubimos
|
||||
hubisteis
|
||||
hubieron
|
||||
hubiera
|
||||
hubieras
|
||||
hubiéramos
|
||||
hubierais
|
||||
hubieran
|
||||
hubiese
|
||||
hubieses
|
||||
hubiésemos
|
||||
hubieseis
|
||||
hubiesen
|
||||
habiendo
|
||||
habido
|
||||
habida
|
||||
habidos
|
||||
habidas
|
||||
|
||||
| forms of ser, to be (not including the infinitive):
|
||||
soy
|
||||
eres
|
||||
es
|
||||
somos
|
||||
sois
|
||||
son
|
||||
sea
|
||||
seas
|
||||
seamos
|
||||
seáis
|
||||
sean
|
||||
seré
|
||||
serás
|
||||
será
|
||||
seremos
|
||||
seréis
|
||||
serán
|
||||
sería
|
||||
serías
|
||||
seríamos
|
||||
seríais
|
||||
serían
|
||||
era
|
||||
eras
|
||||
éramos
|
||||
erais
|
||||
eran
|
||||
fui
|
||||
fuiste
|
||||
fue
|
||||
fuimos
|
||||
fuisteis
|
||||
fueron
|
||||
fuera
|
||||
fueras
|
||||
fuéramos
|
||||
fuerais
|
||||
fueran
|
||||
fuese
|
||||
fueses
|
||||
fuésemos
|
||||
fueseis
|
||||
fuesen
|
||||
siendo
|
||||
sido
|
||||
| sed also means 'thirst'
|
||||
|
||||
| forms of tener, to have (not including the infinitive):
|
||||
tengo
|
||||
tienes
|
||||
tiene
|
||||
tenemos
|
||||
tenéis
|
||||
tienen
|
||||
tenga
|
||||
tengas
|
||||
tengamos
|
||||
tengáis
|
||||
tengan
|
||||
tendré
|
||||
tendrás
|
||||
tendrá
|
||||
tendremos
|
||||
tendréis
|
||||
tendrán
|
||||
tendría
|
||||
tendrías
|
||||
tendríamos
|
||||
tendríais
|
||||
tendrían
|
||||
tenía
|
||||
tenías
|
||||
teníamos
|
||||
teníais
|
||||
tenían
|
||||
tuve
|
||||
tuviste
|
||||
tuvo
|
||||
tuvimos
|
||||
tuvisteis
|
||||
tuvieron
|
||||
tuviera
|
||||
tuvieras
|
||||
tuviéramos
|
||||
tuvierais
|
||||
tuvieran
|
||||
tuviese
|
||||
tuvieses
|
||||
tuviésemos
|
||||
tuvieseis
|
||||
tuviesen
|
||||
teniendo
|
||||
tenido
|
||||
tenida
|
||||
tenidos
|
||||
tenidas
|
||||
tened
|
||||
|
||||
@@ -1,99 +0,0 @@
|
||||
# example set of basque stopwords
|
||||
al
|
||||
anitz
|
||||
arabera
|
||||
asko
|
||||
baina
|
||||
bat
|
||||
batean
|
||||
batek
|
||||
bati
|
||||
batzuei
|
||||
batzuek
|
||||
batzuetan
|
||||
batzuk
|
||||
bera
|
||||
beraiek
|
||||
berau
|
||||
berauek
|
||||
bere
|
||||
berori
|
||||
beroriek
|
||||
beste
|
||||
bezala
|
||||
da
|
||||
dago
|
||||
dira
|
||||
ditu
|
||||
du
|
||||
dute
|
||||
edo
|
||||
egin
|
||||
ere
|
||||
eta
|
||||
eurak
|
||||
ez
|
||||
gainera
|
||||
gu
|
||||
gutxi
|
||||
guzti
|
||||
haiei
|
||||
haiek
|
||||
haietan
|
||||
hainbeste
|
||||
hala
|
||||
han
|
||||
handik
|
||||
hango
|
||||
hara
|
||||
hari
|
||||
hark
|
||||
hartan
|
||||
hau
|
||||
hauei
|
||||
hauek
|
||||
hauetan
|
||||
hemen
|
||||
hemendik
|
||||
hemengo
|
||||
hi
|
||||
hona
|
||||
honek
|
||||
honela
|
||||
honetan
|
||||
honi
|
||||
hor
|
||||
hori
|
||||
horiei
|
||||
horiek
|
||||
horietan
|
||||
horko
|
||||
horra
|
||||
horrek
|
||||
horrela
|
||||
horretan
|
||||
horri
|
||||
hortik
|
||||
hura
|
||||
izan
|
||||
ni
|
||||
noiz
|
||||
nola
|
||||
non
|
||||
nondik
|
||||
nongo
|
||||
nor
|
||||
nora
|
||||
ze
|
||||
zein
|
||||
zen
|
||||
zenbait
|
||||
zenbat
|
||||
zer
|
||||
zergatik
|
||||
ziren
|
||||
zituen
|
||||
zu
|
||||
zuek
|
||||
zuen
|
||||
zuten
|
||||
@@ -1,313 +0,0 @@
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# Note: by default this file is used after normalization, so when adding entries
|
||||
# to this file, use the arabic 'ي' instead of 'ی'
|
||||
انان
|
||||
نداشته
|
||||
سراسر
|
||||
خياه
|
||||
ايشان
|
||||
وي
|
||||
تاكنون
|
||||
بيشتري
|
||||
دوم
|
||||
پس
|
||||
ناشي
|
||||
وگو
|
||||
يا
|
||||
داشتند
|
||||
سپس
|
||||
هنگام
|
||||
هرگز
|
||||
پنج
|
||||
نشان
|
||||
امسال
|
||||
ديگر
|
||||
گروهي
|
||||
شدند
|
||||
چطور
|
||||
ده
|
||||
و
|
||||
دو
|
||||
نخستين
|
||||
ولي
|
||||
چرا
|
||||
چه
|
||||
وسط
|
||||
ه
|
||||
كدام
|
||||
قابل
|
||||
يك
|
||||
رفت
|
||||
هفت
|
||||
همچنين
|
||||
در
|
||||
هزار
|
||||
بله
|
||||
بلي
|
||||
شايد
|
||||
اما
|
||||
شناسي
|
||||
گرفته
|
||||
دهد
|
||||
داشته
|
||||
دانست
|
||||
داشتن
|
||||
خواهيم
|
||||
ميليارد
|
||||
وقتيكه
|
||||
امد
|
||||
خواهد
|
||||
جز
|
||||
اورده
|
||||
شده
|
||||
بلكه
|
||||
خدمات
|
||||
شدن
|
||||
برخي
|
||||
نبود
|
||||
بسياري
|
||||
جلوگيري
|
||||
حق
|
||||
كردند
|
||||
نوعي
|
||||
بعري
|
||||
نكرده
|
||||
نظير
|
||||
نبايد
|
||||
بوده
|
||||
بودن
|
||||
داد
|
||||
اورد
|
||||
هست
|
||||
جايي
|
||||
شود
|
||||
دنبال
|
||||
داده
|
||||
بايد
|
||||
سابق
|
||||
هيچ
|
||||
همان
|
||||
انجا
|
||||
كمتر
|
||||
كجاست
|
||||
گردد
|
||||
كسي
|
||||
تر
|
||||
مردم
|
||||
تان
|
||||
دادن
|
||||
بودند
|
||||
سري
|
||||
جدا
|
||||
ندارند
|
||||
مگر
|
||||
يكديگر
|
||||
دارد
|
||||
دهند
|
||||
بنابراين
|
||||
هنگامي
|
||||
سمت
|
||||
جا
|
||||
انچه
|
||||
خود
|
||||
دادند
|
||||
زياد
|
||||
دارند
|
||||
اثر
|
||||
بدون
|
||||
بهترين
|
||||
بيشتر
|
||||
البته
|
||||
به
|
||||
براساس
|
||||
بيرون
|
||||
كرد
|
||||
بعضي
|
||||
گرفت
|
||||
توي
|
||||
اي
|
||||
ميليون
|
||||
او
|
||||
جريان
|
||||
تول
|
||||
بر
|
||||
مانند
|
||||
برابر
|
||||
باشيم
|
||||
مدتي
|
||||
گويند
|
||||
اكنون
|
||||
تا
|
||||
تنها
|
||||
جديد
|
||||
چند
|
||||
بي
|
||||
نشده
|
||||
كردن
|
||||
كردم
|
||||
گويد
|
||||
كرده
|
||||
كنيم
|
||||
نمي
|
||||
نزد
|
||||
روي
|
||||
قصد
|
||||
فقط
|
||||
بالاي
|
||||
ديگران
|
||||
اين
|
||||
ديروز
|
||||
توسط
|
||||
سوم
|
||||
ايم
|
||||
دانند
|
||||
سوي
|
||||
استفاده
|
||||
شما
|
||||
كنار
|
||||
داريم
|
||||
ساخته
|
||||
طور
|
||||
امده
|
||||
رفته
|
||||
نخست
|
||||
بيست
|
||||
نزديك
|
||||
طي
|
||||
كنيد
|
||||
از
|
||||
انها
|
||||
تمامي
|
||||
داشت
|
||||
يكي
|
||||
طريق
|
||||
اش
|
||||
چيست
|
||||
روب
|
||||
نمايد
|
||||
گفت
|
||||
چندين
|
||||
چيزي
|
||||
تواند
|
||||
ام
|
||||
ايا
|
||||
با
|
||||
ان
|
||||
ايد
|
||||
ترين
|
||||
اينكه
|
||||
ديگري
|
||||
راه
|
||||
هايي
|
||||
بروز
|
||||
همچنان
|
||||
پاعين
|
||||
كس
|
||||
حدود
|
||||
مختلف
|
||||
مقابل
|
||||
چيز
|
||||
گيرد
|
||||
ندارد
|
||||
ضد
|
||||
همچون
|
||||
سازي
|
||||
شان
|
||||
مورد
|
||||
باره
|
||||
مرسي
|
||||
خويش
|
||||
برخوردار
|
||||
چون
|
||||
خارج
|
||||
شش
|
||||
هنوز
|
||||
تحت
|
||||
ضمن
|
||||
هستيم
|
||||
گفته
|
||||
فكر
|
||||
بسيار
|
||||
پيش
|
||||
براي
|
||||
روزهاي
|
||||
انكه
|
||||
نخواهد
|
||||
بالا
|
||||
كل
|
||||
وقتي
|
||||
كي
|
||||
چنين
|
||||
كه
|
||||
گيري
|
||||
نيست
|
||||
است
|
||||
كجا
|
||||
كند
|
||||
نيز
|
||||
يابد
|
||||
بندي
|
||||
حتي
|
||||
توانند
|
||||
عقب
|
||||
خواست
|
||||
كنند
|
||||
بين
|
||||
تمام
|
||||
همه
|
||||
ما
|
||||
باشند
|
||||
مثل
|
||||
شد
|
||||
اري
|
||||
باشد
|
||||
اره
|
||||
طبق
|
||||
بعد
|
||||
اگر
|
||||
صورت
|
||||
غير
|
||||
جاي
|
||||
بيش
|
||||
ريزي
|
||||
اند
|
||||
زيرا
|
||||
چگونه
|
||||
بار
|
||||
لطفا
|
||||
مي
|
||||
درباره
|
||||
من
|
||||
ديده
|
||||
همين
|
||||
گذاري
|
||||
برداري
|
||||
علت
|
||||
گذاشته
|
||||
هم
|
||||
فوق
|
||||
نه
|
||||
ها
|
||||
شوند
|
||||
اباد
|
||||
همواره
|
||||
هر
|
||||
اول
|
||||
خواهند
|
||||
چهار
|
||||
نام
|
||||
امروز
|
||||
مان
|
||||
هاي
|
||||
قبل
|
||||
كنم
|
||||
سعي
|
||||
تازه
|
||||
را
|
||||
هستند
|
||||
زير
|
||||
جلوي
|
||||
عنوان
|
||||
بود
|
||||
@@ -1,95 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| forms of BE
|
||||
|
||||
olla
|
||||
olen
|
||||
olet
|
||||
on
|
||||
olemme
|
||||
olette
|
||||
ovat
|
||||
ole | negative form
|
||||
|
||||
oli
|
||||
olisi
|
||||
olisit
|
||||
olisin
|
||||
olisimme
|
||||
olisitte
|
||||
olisivat
|
||||
olit
|
||||
olin
|
||||
olimme
|
||||
olitte
|
||||
olivat
|
||||
ollut
|
||||
olleet
|
||||
|
||||
en | negation
|
||||
et
|
||||
ei
|
||||
emme
|
||||
ette
|
||||
eivät
|
||||
|
||||
|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans
|
||||
minä minun minut minua minussa minusta minuun minulla minulta minulle | I
|
||||
sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you
|
||||
hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she
|
||||
me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we
|
||||
te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you
|
||||
he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they
|
||||
|
||||
tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this
|
||||
tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that
|
||||
se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it
|
||||
nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these
|
||||
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those
|
||||
ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they
|
||||
|
||||
kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
|
||||
ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl)
|
||||
mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what
|
||||
mitkä | (pl)
|
||||
|
||||
joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which
|
||||
jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl)
|
||||
|
||||
| conjunctions
|
||||
|
||||
että | that
|
||||
ja | and
|
||||
jos | if
|
||||
koska | because
|
||||
kuin | than
|
||||
mutta | but
|
||||
niin | so
|
||||
sekä | and
|
||||
sillä | for
|
||||
tai | or
|
||||
vaan | but
|
||||
vai | or
|
||||
vaikka | although
|
||||
|
||||
|
||||
| prepositions
|
||||
|
||||
kanssa | with
|
||||
mukaan | according to
|
||||
noin | about
|
||||
poikki | across
|
||||
yli | over, across
|
||||
|
||||
| other
|
||||
|
||||
kun | when
|
||||
niin | so
|
||||
nyt | now
|
||||
itse | self
|
||||
|
||||
@@ -1,184 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/french/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A French stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
au | a + le
|
||||
aux | a + les
|
||||
avec | with
|
||||
ce | this
|
||||
ces | these
|
||||
dans | with
|
||||
de | of
|
||||
des | de + les
|
||||
du | de + le
|
||||
elle | she
|
||||
en | `of them' etc
|
||||
et | and
|
||||
eux | them
|
||||
il | he
|
||||
je | I
|
||||
la | the
|
||||
le | the
|
||||
leur | their
|
||||
lui | him
|
||||
ma | my (fem)
|
||||
mais | but
|
||||
me | me
|
||||
même | same; as in moi-même (myself) etc
|
||||
mes | me (pl)
|
||||
moi | me
|
||||
mon | my (masc)
|
||||
ne | not
|
||||
nos | our (pl)
|
||||
notre | our
|
||||
nous | we
|
||||
on | one
|
||||
ou | where
|
||||
par | by
|
||||
pas | not
|
||||
pour | for
|
||||
qu | que before vowel
|
||||
que | that
|
||||
qui | who
|
||||
sa | his, her (fem)
|
||||
se | oneself
|
||||
ses | his (pl)
|
||||
son | his, her (masc)
|
||||
sur | on
|
||||
ta | thy (fem)
|
||||
te | thee
|
||||
tes | thy (pl)
|
||||
toi | thee
|
||||
ton | thy (masc)
|
||||
tu | thou
|
||||
un | a
|
||||
une | a
|
||||
vos | your (pl)
|
||||
votre | your
|
||||
vous | you
|
||||
|
||||
| single letter forms
|
||||
|
||||
c | c'
|
||||
d | d'
|
||||
j | j'
|
||||
l | l'
|
||||
à | to, at
|
||||
m | m'
|
||||
n | n'
|
||||
s | s'
|
||||
t | t'
|
||||
y | there
|
||||
|
||||
| forms of être (not including the infinitive):
|
||||
été
|
||||
étée
|
||||
étées
|
||||
étés
|
||||
étant
|
||||
suis
|
||||
es
|
||||
est
|
||||
sommes
|
||||
êtes
|
||||
sont
|
||||
serai
|
||||
seras
|
||||
sera
|
||||
serons
|
||||
serez
|
||||
seront
|
||||
serais
|
||||
serait
|
||||
serions
|
||||
seriez
|
||||
seraient
|
||||
étais
|
||||
était
|
||||
étions
|
||||
étiez
|
||||
étaient
|
||||
fus
|
||||
fut
|
||||
fûmes
|
||||
fûtes
|
||||
furent
|
||||
sois
|
||||
soit
|
||||
soyons
|
||||
soyez
|
||||
soient
|
||||
fusse
|
||||
fusses
|
||||
fût
|
||||
fussions
|
||||
fussiez
|
||||
fussent
|
||||
|
||||
| forms of avoir (not including the infinitive):
|
||||
ayant
|
||||
eu
|
||||
eue
|
||||
eues
|
||||
eus
|
||||
ai
|
||||
as
|
||||
avons
|
||||
avez
|
||||
ont
|
||||
aurai
|
||||
auras
|
||||
aura
|
||||
aurons
|
||||
aurez
|
||||
auront
|
||||
aurais
|
||||
aurait
|
||||
aurions
|
||||
auriez
|
||||
auraient
|
||||
avais
|
||||
avait
|
||||
avions
|
||||
aviez
|
||||
avaient
|
||||
eut
|
||||
eûmes
|
||||
eûtes
|
||||
eurent
|
||||
aie
|
||||
aies
|
||||
ait
|
||||
ayons
|
||||
ayez
|
||||
aient
|
||||
eusse
|
||||
eusses
|
||||
eût
|
||||
eussions
|
||||
eussiez
|
||||
eussent
|
||||
|
||||
| Later additions (from Jean-Christophe Deschamps)
|
||||
ceci | this
|
||||
cela | that
|
||||
celà | that
|
||||
cet | this
|
||||
cette | this
|
||||
ici | here
|
||||
ils | they
|
||||
les | the (pl)
|
||||
leurs | their (pl)
|
||||
quel | which
|
||||
quels | which
|
||||
quelle | which
|
||||
quelles | which
|
||||
sans | without
|
||||
soi | oneself
|
||||
|
||||
@@ -1,110 +0,0 @@
|
||||
|
||||
a
|
||||
ach
|
||||
ag
|
||||
agus
|
||||
an
|
||||
aon
|
||||
ar
|
||||
arna
|
||||
as
|
||||
b'
|
||||
ba
|
||||
beirt
|
||||
bhúr
|
||||
caoga
|
||||
ceathair
|
||||
ceathrar
|
||||
chomh
|
||||
chtó
|
||||
chuig
|
||||
chun
|
||||
cois
|
||||
céad
|
||||
cúig
|
||||
cúigear
|
||||
d'
|
||||
daichead
|
||||
dar
|
||||
de
|
||||
deich
|
||||
deichniúr
|
||||
den
|
||||
dhá
|
||||
do
|
||||
don
|
||||
dtí
|
||||
dá
|
||||
dár
|
||||
dó
|
||||
faoi
|
||||
faoin
|
||||
faoina
|
||||
faoinár
|
||||
fara
|
||||
fiche
|
||||
gach
|
||||
gan
|
||||
go
|
||||
gur
|
||||
haon
|
||||
hocht
|
||||
i
|
||||
iad
|
||||
idir
|
||||
in
|
||||
ina
|
||||
ins
|
||||
inár
|
||||
is
|
||||
le
|
||||
leis
|
||||
lena
|
||||
lenár
|
||||
m'
|
||||
mar
|
||||
mo
|
||||
mé
|
||||
na
|
||||
nach
|
||||
naoi
|
||||
naonúr
|
||||
ná
|
||||
ní
|
||||
níor
|
||||
nó
|
||||
nócha
|
||||
ocht
|
||||
ochtar
|
||||
os
|
||||
roimh
|
||||
sa
|
||||
seacht
|
||||
seachtar
|
||||
seachtó
|
||||
seasca
|
||||
seisear
|
||||
siad
|
||||
sibh
|
||||
sinn
|
||||
sna
|
||||
sé
|
||||
sí
|
||||
tar
|
||||
thar
|
||||
thú
|
||||
triúr
|
||||
trí
|
||||
trína
|
||||
trínár
|
||||
tríocha
|
||||
tú
|
||||
um
|
||||
ár
|
||||
é
|
||||
éis
|
||||
í
|
||||
ó
|
||||
ón
|
||||
óna
|
||||
ónár
|
||||
@@ -1,161 +0,0 @@
|
||||
# galican stopwords
|
||||
a
|
||||
aínda
|
||||
alí
|
||||
aquel
|
||||
aquela
|
||||
aquelas
|
||||
aqueles
|
||||
aquilo
|
||||
aquí
|
||||
ao
|
||||
aos
|
||||
as
|
||||
así
|
||||
á
|
||||
ben
|
||||
cando
|
||||
che
|
||||
co
|
||||
coa
|
||||
comigo
|
||||
con
|
||||
connosco
|
||||
contigo
|
||||
convosco
|
||||
coas
|
||||
cos
|
||||
cun
|
||||
cuns
|
||||
cunha
|
||||
cunhas
|
||||
da
|
||||
dalgunha
|
||||
dalgunhas
|
||||
dalgún
|
||||
dalgúns
|
||||
das
|
||||
de
|
||||
del
|
||||
dela
|
||||
delas
|
||||
deles
|
||||
desde
|
||||
deste
|
||||
do
|
||||
dos
|
||||
dun
|
||||
duns
|
||||
dunha
|
||||
dunhas
|
||||
e
|
||||
el
|
||||
ela
|
||||
elas
|
||||
eles
|
||||
en
|
||||
era
|
||||
eran
|
||||
esa
|
||||
esas
|
||||
ese
|
||||
eses
|
||||
esta
|
||||
estar
|
||||
estaba
|
||||
está
|
||||
están
|
||||
este
|
||||
estes
|
||||
estiven
|
||||
estou
|
||||
eu
|
||||
é
|
||||
facer
|
||||
foi
|
||||
foron
|
||||
fun
|
||||
había
|
||||
hai
|
||||
iso
|
||||
isto
|
||||
la
|
||||
las
|
||||
lle
|
||||
lles
|
||||
lo
|
||||
los
|
||||
mais
|
||||
me
|
||||
meu
|
||||
meus
|
||||
min
|
||||
miña
|
||||
miñas
|
||||
moi
|
||||
na
|
||||
nas
|
||||
neste
|
||||
nin
|
||||
no
|
||||
non
|
||||
nos
|
||||
nosa
|
||||
nosas
|
||||
noso
|
||||
nosos
|
||||
nós
|
||||
nun
|
||||
nunha
|
||||
nuns
|
||||
nunhas
|
||||
o
|
||||
os
|
||||
ou
|
||||
ó
|
||||
ós
|
||||
para
|
||||
pero
|
||||
pode
|
||||
pois
|
||||
pola
|
||||
polas
|
||||
polo
|
||||
polos
|
||||
por
|
||||
que
|
||||
se
|
||||
senón
|
||||
ser
|
||||
seu
|
||||
seus
|
||||
sexa
|
||||
sido
|
||||
sobre
|
||||
súa
|
||||
súas
|
||||
tamén
|
||||
tan
|
||||
te
|
||||
ten
|
||||
teñen
|
||||
teño
|
||||
ter
|
||||
teu
|
||||
teus
|
||||
ti
|
||||
tido
|
||||
tiña
|
||||
tiven
|
||||
túa
|
||||
túas
|
||||
un
|
||||
unha
|
||||
unhas
|
||||
uns
|
||||
vos
|
||||
vosa
|
||||
vosas
|
||||
voso
|
||||
vosos
|
||||
vós
|
||||
@@ -1,235 +0,0 @@
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# Note: by default this file also contains forms normalized by HindiNormalizer
|
||||
# for spelling variation (see section below), such that it can be used whether or
|
||||
# not you enable that feature. When adding additional entries to this list,
|
||||
# please add the normalized form as well.
|
||||
अंदर
|
||||
अत
|
||||
अपना
|
||||
अपनी
|
||||
अपने
|
||||
अभी
|
||||
आदि
|
||||
आप
|
||||
इत्यादि
|
||||
इन
|
||||
इनका
|
||||
इन्हीं
|
||||
इन्हें
|
||||
इन्हों
|
||||
इस
|
||||
इसका
|
||||
इसकी
|
||||
इसके
|
||||
इसमें
|
||||
इसी
|
||||
इसे
|
||||
उन
|
||||
उनका
|
||||
उनकी
|
||||
उनके
|
||||
उनको
|
||||
उन्हीं
|
||||
उन्हें
|
||||
उन्हों
|
||||
उस
|
||||
उसके
|
||||
उसी
|
||||
उसे
|
||||
एक
|
||||
एवं
|
||||
एस
|
||||
ऐसे
|
||||
और
|
||||
कई
|
||||
कर
|
||||
करता
|
||||
करते
|
||||
करना
|
||||
करने
|
||||
करें
|
||||
कहते
|
||||
कहा
|
||||
का
|
||||
काफ़ी
|
||||
कि
|
||||
कितना
|
||||
किन्हें
|
||||
किन्हों
|
||||
किया
|
||||
किर
|
||||
किस
|
||||
किसी
|
||||
किसे
|
||||
की
|
||||
कुछ
|
||||
कुल
|
||||
के
|
||||
को
|
||||
कोई
|
||||
कौन
|
||||
कौनसा
|
||||
गया
|
||||
घर
|
||||
जब
|
||||
जहाँ
|
||||
जा
|
||||
जितना
|
||||
जिन
|
||||
जिन्हें
|
||||
जिन्हों
|
||||
जिस
|
||||
जिसे
|
||||
जीधर
|
||||
जैसा
|
||||
जैसे
|
||||
जो
|
||||
तक
|
||||
तब
|
||||
तरह
|
||||
तिन
|
||||
तिन्हें
|
||||
तिन्हों
|
||||
तिस
|
||||
तिसे
|
||||
तो
|
||||
था
|
||||
थी
|
||||
थे
|
||||
दबारा
|
||||
दिया
|
||||
दुसरा
|
||||
दूसरे
|
||||
दो
|
||||
द्वारा
|
||||
न
|
||||
नहीं
|
||||
ना
|
||||
निहायत
|
||||
नीचे
|
||||
ने
|
||||
पर
|
||||
पर
|
||||
पहले
|
||||
पूरा
|
||||
पे
|
||||
फिर
|
||||
बनी
|
||||
बही
|
||||
बहुत
|
||||
बाद
|
||||
बाला
|
||||
बिलकुल
|
||||
भी
|
||||
भीतर
|
||||
मगर
|
||||
मानो
|
||||
मे
|
||||
में
|
||||
यदि
|
||||
यह
|
||||
यहाँ
|
||||
यही
|
||||
या
|
||||
यिह
|
||||
ये
|
||||
रखें
|
||||
रहा
|
||||
रहे
|
||||
ऱ्वासा
|
||||
लिए
|
||||
लिये
|
||||
लेकिन
|
||||
व
|
||||
वर्ग
|
||||
वह
|
||||
वह
|
||||
वहाँ
|
||||
वहीं
|
||||
वाले
|
||||
वुह
|
||||
वे
|
||||
वग़ैरह
|
||||
संग
|
||||
सकता
|
||||
सकते
|
||||
सबसे
|
||||
सभी
|
||||
साथ
|
||||
साबुत
|
||||
साभ
|
||||
सारा
|
||||
से
|
||||
सो
|
||||
ही
|
||||
हुआ
|
||||
हुई
|
||||
हुए
|
||||
है
|
||||
हैं
|
||||
हो
|
||||
होता
|
||||
होती
|
||||
होते
|
||||
होना
|
||||
होने
|
||||
# additional normalized forms of the above
|
||||
अपनि
|
||||
जेसे
|
||||
होति
|
||||
सभि
|
||||
तिंहों
|
||||
इंहों
|
||||
दवारा
|
||||
इसि
|
||||
किंहें
|
||||
थि
|
||||
उंहों
|
||||
ओर
|
||||
जिंहें
|
||||
वहिं
|
||||
अभि
|
||||
बनि
|
||||
हि
|
||||
उंहिं
|
||||
उंहें
|
||||
हें
|
||||
वगेरह
|
||||
एसे
|
||||
रवासा
|
||||
कोन
|
||||
निचे
|
||||
काफि
|
||||
उसि
|
||||
पुरा
|
||||
भितर
|
||||
हे
|
||||
बहि
|
||||
वहां
|
||||
कोइ
|
||||
यहां
|
||||
जिंहों
|
||||
तिंहें
|
||||
किसि
|
||||
कइ
|
||||
यहि
|
||||
इंहिं
|
||||
जिधर
|
||||
इंहें
|
||||
अदि
|
||||
इतयादि
|
||||
हुइ
|
||||
कोनसा
|
||||
इसकि
|
||||
दुसरे
|
||||
जहां
|
||||
अप
|
||||
किंहों
|
||||
उनकि
|
||||
भि
|
||||
वरग
|
||||
हुअ
|
||||
जेसा
|
||||
नहिं
|
||||
@@ -1,209 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| Hungarian stop word list
|
||||
| prepared by Anna Tordai
|
||||
|
||||
a
|
||||
ahogy
|
||||
ahol
|
||||
aki
|
||||
akik
|
||||
akkor
|
||||
alatt
|
||||
által
|
||||
általában
|
||||
amely
|
||||
amelyek
|
||||
amelyekben
|
||||
amelyeket
|
||||
amelyet
|
||||
amelynek
|
||||
ami
|
||||
amit
|
||||
amolyan
|
||||
amíg
|
||||
amikor
|
||||
át
|
||||
abban
|
||||
ahhoz
|
||||
annak
|
||||
arra
|
||||
arról
|
||||
az
|
||||
azok
|
||||
azon
|
||||
azt
|
||||
azzal
|
||||
azért
|
||||
aztán
|
||||
azután
|
||||
azonban
|
||||
bár
|
||||
be
|
||||
belül
|
||||
benne
|
||||
cikk
|
||||
cikkek
|
||||
cikkeket
|
||||
csak
|
||||
de
|
||||
e
|
||||
eddig
|
||||
egész
|
||||
egy
|
||||
egyes
|
||||
egyetlen
|
||||
egyéb
|
||||
egyik
|
||||
egyre
|
||||
ekkor
|
||||
el
|
||||
elég
|
||||
ellen
|
||||
elő
|
||||
először
|
||||
előtt
|
||||
első
|
||||
én
|
||||
éppen
|
||||
ebben
|
||||
ehhez
|
||||
emilyen
|
||||
ennek
|
||||
erre
|
||||
ez
|
||||
ezt
|
||||
ezek
|
||||
ezen
|
||||
ezzel
|
||||
ezért
|
||||
és
|
||||
fel
|
||||
felé
|
||||
hanem
|
||||
hiszen
|
||||
hogy
|
||||
hogyan
|
||||
igen
|
||||
így
|
||||
illetve
|
||||
ill.
|
||||
ill
|
||||
ilyen
|
||||
ilyenkor
|
||||
ison
|
||||
ismét
|
||||
itt
|
||||
jó
|
||||
jól
|
||||
jobban
|
||||
kell
|
||||
kellett
|
||||
keresztül
|
||||
keressünk
|
||||
ki
|
||||
kívül
|
||||
között
|
||||
közül
|
||||
legalább
|
||||
lehet
|
||||
lehetett
|
||||
legyen
|
||||
lenne
|
||||
lenni
|
||||
lesz
|
||||
lett
|
||||
maga
|
||||
magát
|
||||
majd
|
||||
majd
|
||||
már
|
||||
más
|
||||
másik
|
||||
meg
|
||||
még
|
||||
mellett
|
||||
mert
|
||||
mely
|
||||
melyek
|
||||
mi
|
||||
mit
|
||||
míg
|
||||
miért
|
||||
milyen
|
||||
mikor
|
||||
minden
|
||||
mindent
|
||||
mindenki
|
||||
mindig
|
||||
mint
|
||||
mintha
|
||||
mivel
|
||||
most
|
||||
nagy
|
||||
nagyobb
|
||||
nagyon
|
||||
ne
|
||||
néha
|
||||
nekem
|
||||
neki
|
||||
nem
|
||||
néhány
|
||||
nélkül
|
||||
nincs
|
||||
olyan
|
||||
ott
|
||||
össze
|
||||
ő
|
||||
ők
|
||||
őket
|
||||
pedig
|
||||
persze
|
||||
rá
|
||||
s
|
||||
saját
|
||||
sem
|
||||
semmi
|
||||
sok
|
||||
sokat
|
||||
sokkal
|
||||
számára
|
||||
szemben
|
||||
szerint
|
||||
szinte
|
||||
talán
|
||||
tehát
|
||||
teljes
|
||||
tovább
|
||||
továbbá
|
||||
több
|
||||
úgy
|
||||
ugyanis
|
||||
új
|
||||
újabb
|
||||
újra
|
||||
után
|
||||
utána
|
||||
utolsó
|
||||
vagy
|
||||
vagyis
|
||||
valaki
|
||||
valami
|
||||
valamint
|
||||
való
|
||||
vagyok
|
||||
van
|
||||
vannak
|
||||
volt
|
||||
voltam
|
||||
voltak
|
||||
voltunk
|
||||
vissza
|
||||
vele
|
||||
viszont
|
||||
volna
|
||||
@@ -1,46 +0,0 @@
|
||||
# example set of Armenian stopwords.
|
||||
այդ
|
||||
այլ
|
||||
այն
|
||||
այս
|
||||
դու
|
||||
դուք
|
||||
եմ
|
||||
են
|
||||
ենք
|
||||
ես
|
||||
եք
|
||||
է
|
||||
էի
|
||||
էին
|
||||
էինք
|
||||
էիր
|
||||
էիք
|
||||
էր
|
||||
ըստ
|
||||
թ
|
||||
ի
|
||||
ին
|
||||
իսկ
|
||||
իր
|
||||
կամ
|
||||
համար
|
||||
հետ
|
||||
հետո
|
||||
մենք
|
||||
մեջ
|
||||
մի
|
||||
ն
|
||||
նա
|
||||
նաև
|
||||
նրա
|
||||
նրանք
|
||||
որ
|
||||
որը
|
||||
որոնք
|
||||
որպես
|
||||
ու
|
||||
ում
|
||||
պիտի
|
||||
վրա
|
||||
և
|
||||
@@ -1,359 +0,0 @@
|
||||
# from appendix D of: A Study of Stemming Effects on Information
|
||||
# Retrieval in Bahasa Indonesia
|
||||
ada
|
||||
adanya
|
||||
adalah
|
||||
adapun
|
||||
agak
|
||||
agaknya
|
||||
agar
|
||||
akan
|
||||
akankah
|
||||
akhirnya
|
||||
aku
|
||||
akulah
|
||||
amat
|
||||
amatlah
|
||||
anda
|
||||
andalah
|
||||
antar
|
||||
diantaranya
|
||||
antara
|
||||
antaranya
|
||||
diantara
|
||||
apa
|
||||
apaan
|
||||
mengapa
|
||||
apabila
|
||||
apakah
|
||||
apalagi
|
||||
apatah
|
||||
atau
|
||||
ataukah
|
||||
ataupun
|
||||
bagai
|
||||
bagaikan
|
||||
sebagai
|
||||
sebagainya
|
||||
bagaimana
|
||||
bagaimanapun
|
||||
sebagaimana
|
||||
bagaimanakah
|
||||
bagi
|
||||
bahkan
|
||||
bahwa
|
||||
bahwasanya
|
||||
sebaliknya
|
||||
banyak
|
||||
sebanyak
|
||||
beberapa
|
||||
seberapa
|
||||
begini
|
||||
beginian
|
||||
beginikah
|
||||
beginilah
|
||||
sebegini
|
||||
begitu
|
||||
begitukah
|
||||
begitulah
|
||||
begitupun
|
||||
sebegitu
|
||||
belum
|
||||
belumlah
|
||||
sebelum
|
||||
sebelumnya
|
||||
sebenarnya
|
||||
berapa
|
||||
berapakah
|
||||
berapalah
|
||||
berapapun
|
||||
betulkah
|
||||
sebetulnya
|
||||
biasa
|
||||
biasanya
|
||||
bila
|
||||
bilakah
|
||||
bisa
|
||||
bisakah
|
||||
sebisanya
|
||||
boleh
|
||||
bolehkah
|
||||
bolehlah
|
||||
buat
|
||||
bukan
|
||||
bukankah
|
||||
bukanlah
|
||||
bukannya
|
||||
cuma
|
||||
percuma
|
||||
dahulu
|
||||
dalam
|
||||
dan
|
||||
dapat
|
||||
dari
|
||||
daripada
|
||||
dekat
|
||||
demi
|
||||
demikian
|
||||
demikianlah
|
||||
sedemikian
|
||||
dengan
|
||||
depan
|
||||
di
|
||||
dia
|
||||
dialah
|
||||
dini
|
||||
diri
|
||||
dirinya
|
||||
terdiri
|
||||
dong
|
||||
dulu
|
||||
enggak
|
||||
enggaknya
|
||||
entah
|
||||
entahlah
|
||||
terhadap
|
||||
terhadapnya
|
||||
hal
|
||||
hampir
|
||||
hanya
|
||||
hanyalah
|
||||
harus
|
||||
haruslah
|
||||
harusnya
|
||||
seharusnya
|
||||
hendak
|
||||
hendaklah
|
||||
hendaknya
|
||||
hingga
|
||||
sehingga
|
||||
ia
|
||||
ialah
|
||||
ibarat
|
||||
ingin
|
||||
inginkah
|
||||
inginkan
|
||||
ini
|
||||
inikah
|
||||
inilah
|
||||
itu
|
||||
itukah
|
||||
itulah
|
||||
jangan
|
||||
jangankan
|
||||
janganlah
|
||||
jika
|
||||
jikalau
|
||||
juga
|
||||
justru
|
||||
kala
|
||||
kalau
|
||||
kalaulah
|
||||
kalaupun
|
||||
kalian
|
||||
kami
|
||||
kamilah
|
||||
kamu
|
||||
kamulah
|
||||
kan
|
||||
kapan
|
||||
kapankah
|
||||
kapanpun
|
||||
dikarenakan
|
||||
karena
|
||||
karenanya
|
||||
ke
|
||||
kecil
|
||||
kemudian
|
||||
kenapa
|
||||
kepada
|
||||
kepadanya
|
||||
ketika
|
||||
seketika
|
||||
khususnya
|
||||
kini
|
||||
kinilah
|
||||
kiranya
|
||||
sekiranya
|
||||
kita
|
||||
kitalah
|
||||
kok
|
||||
lagi
|
||||
lagian
|
||||
selagi
|
||||
lah
|
||||
lain
|
||||
lainnya
|
||||
melainkan
|
||||
selaku
|
||||
lalu
|
||||
melalui
|
||||
terlalu
|
||||
lama
|
||||
lamanya
|
||||
selama
|
||||
selama
|
||||
selamanya
|
||||
lebih
|
||||
terlebih
|
||||
bermacam
|
||||
macam
|
||||
semacam
|
||||
maka
|
||||
makanya
|
||||
makin
|
||||
malah
|
||||
malahan
|
||||
mampu
|
||||
mampukah
|
||||
mana
|
||||
manakala
|
||||
manalagi
|
||||
masih
|
||||
masihkah
|
||||
semasih
|
||||
masing
|
||||
mau
|
||||
maupun
|
||||
semaunya
|
||||
memang
|
||||
mereka
|
||||
merekalah
|
||||
meski
|
||||
meskipun
|
||||
semula
|
||||
mungkin
|
||||
mungkinkah
|
||||
nah
|
||||
namun
|
||||
nanti
|
||||
nantinya
|
||||
nyaris
|
||||
oleh
|
||||
olehnya
|
||||
seorang
|
||||
seseorang
|
||||
pada
|
||||
padanya
|
||||
padahal
|
||||
paling
|
||||
sepanjang
|
||||
pantas
|
||||
sepantasnya
|
||||
sepantasnyalah
|
||||
para
|
||||
pasti
|
||||
pastilah
|
||||
per
|
||||
pernah
|
||||
pula
|
||||
pun
|
||||
merupakan
|
||||
rupanya
|
||||
serupa
|
||||
saat
|
||||
saatnya
|
||||
sesaat
|
||||
saja
|
||||
sajalah
|
||||
saling
|
||||
bersama
|
||||
sama
|
||||
sesama
|
||||
sambil
|
||||
sampai
|
||||
sana
|
||||
sangat
|
||||
sangatlah
|
||||
saya
|
||||
sayalah
|
||||
se
|
||||
sebab
|
||||
sebabnya
|
||||
sebuah
|
||||
tersebut
|
||||
tersebutlah
|
||||
sedang
|
||||
sedangkan
|
||||
sedikit
|
||||
sedikitnya
|
||||
segala
|
||||
segalanya
|
||||
segera
|
||||
sesegera
|
||||
sejak
|
||||
sejenak
|
||||
sekali
|
||||
sekalian
|
||||
sekalipun
|
||||
sesekali
|
||||
sekaligus
|
||||
sekarang
|
||||
sekarang
|
||||
sekitar
|
||||
sekitarnya
|
||||
sela
|
||||
selain
|
||||
selalu
|
||||
seluruh
|
||||
seluruhnya
|
||||
semakin
|
||||
sementara
|
||||
sempat
|
||||
semua
|
||||
semuanya
|
||||
sendiri
|
||||
sendirinya
|
||||
seolah
|
||||
seperti
|
||||
sepertinya
|
||||
sering
|
||||
seringnya
|
||||
serta
|
||||
siapa
|
||||
siapakah
|
||||
siapapun
|
||||
disini
|
||||
disinilah
|
||||
sini
|
||||
sinilah
|
||||
sesuatu
|
||||
sesuatunya
|
||||
suatu
|
||||
sesudah
|
||||
sesudahnya
|
||||
sudah
|
||||
sudahkah
|
||||
sudahlah
|
||||
supaya
|
||||
tadi
|
||||
tadinya
|
||||
tak
|
||||
tanpa
|
||||
setelah
|
||||
telah
|
||||
tentang
|
||||
tentu
|
||||
tentulah
|
||||
tentunya
|
||||
tertentu
|
||||
seterusnya
|
||||
tapi
|
||||
tetapi
|
||||
setiap
|
||||
tiap
|
||||
setidaknya
|
||||
tidak
|
||||
tidakkah
|
||||
tidaklah
|
||||
toh
|
||||
waduh
|
||||
wah
|
||||
wahai
|
||||
sewaktu
|
||||
walau
|
||||
walaupun
|
||||
wong
|
||||
yaitu
|
||||
yakni
|
||||
yang
|
||||
@@ -1,301 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/italian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| An Italian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
ad | a (to) before vowel
|
||||
al | a + il
|
||||
allo | a + lo
|
||||
ai | a + i
|
||||
agli | a + gli
|
||||
all | a + l'
|
||||
agl | a + gl'
|
||||
alla | a + la
|
||||
alle | a + le
|
||||
con | with
|
||||
col | con + il
|
||||
coi | con + i (forms collo, cogli etc are now very rare)
|
||||
da | from
|
||||
dal | da + il
|
||||
dallo | da + lo
|
||||
dai | da + i
|
||||
dagli | da + gli
|
||||
dall | da + l'
|
||||
dagl | da + gll'
|
||||
dalla | da + la
|
||||
dalle | da + le
|
||||
di | of
|
||||
del | di + il
|
||||
dello | di + lo
|
||||
dei | di + i
|
||||
degli | di + gli
|
||||
dell | di + l'
|
||||
degl | di + gl'
|
||||
della | di + la
|
||||
delle | di + le
|
||||
in | in
|
||||
nel | in + el
|
||||
nello | in + lo
|
||||
nei | in + i
|
||||
negli | in + gli
|
||||
nell | in + l'
|
||||
negl | in + gl'
|
||||
nella | in + la
|
||||
nelle | in + le
|
||||
su | on
|
||||
sul | su + il
|
||||
sullo | su + lo
|
||||
sui | su + i
|
||||
sugli | su + gli
|
||||
sull | su + l'
|
||||
sugl | su + gl'
|
||||
sulla | su + la
|
||||
sulle | su + le
|
||||
per | through, by
|
||||
tra | among
|
||||
contro | against
|
||||
io | I
|
||||
tu | thou
|
||||
lui | he
|
||||
lei | she
|
||||
noi | we
|
||||
voi | you
|
||||
loro | they
|
||||
mio | my
|
||||
mia |
|
||||
miei |
|
||||
mie |
|
||||
tuo |
|
||||
tua |
|
||||
tuoi | thy
|
||||
tue |
|
||||
suo |
|
||||
sua |
|
||||
suoi | his, her
|
||||
sue |
|
||||
nostro | our
|
||||
nostra |
|
||||
nostri |
|
||||
nostre |
|
||||
vostro | your
|
||||
vostra |
|
||||
vostri |
|
||||
vostre |
|
||||
mi | me
|
||||
ti | thee
|
||||
ci | us, there
|
||||
vi | you, there
|
||||
lo | him, the
|
||||
la | her, the
|
||||
li | them
|
||||
le | them, the
|
||||
gli | to him, the
|
||||
ne | from there etc
|
||||
il | the
|
||||
un | a
|
||||
uno | a
|
||||
una | a
|
||||
ma | but
|
||||
ed | and
|
||||
se | if
|
||||
perché | why, because
|
||||
anche | also
|
||||
come | how
|
||||
dov | where (as dov')
|
||||
dove | where
|
||||
che | who, that
|
||||
chi | who
|
||||
cui | whom
|
||||
non | not
|
||||
più | more
|
||||
quale | who, that
|
||||
quanto | how much
|
||||
quanti |
|
||||
quanta |
|
||||
quante |
|
||||
quello | that
|
||||
quelli |
|
||||
quella |
|
||||
quelle |
|
||||
questo | this
|
||||
questi |
|
||||
questa |
|
||||
queste |
|
||||
si | yes
|
||||
tutto | all
|
||||
tutti | all
|
||||
|
||||
| single letter forms:
|
||||
|
||||
a | at
|
||||
c | as c' for ce or ci
|
||||
e | and
|
||||
i | the
|
||||
l | as l'
|
||||
o | or
|
||||
|
||||
| forms of avere, to have (not including the infinitive):
|
||||
|
||||
ho
|
||||
hai
|
||||
ha
|
||||
abbiamo
|
||||
avete
|
||||
hanno
|
||||
abbia
|
||||
abbiate
|
||||
abbiano
|
||||
avrò
|
||||
avrai
|
||||
avrà
|
||||
avremo
|
||||
avrete
|
||||
avranno
|
||||
avrei
|
||||
avresti
|
||||
avrebbe
|
||||
avremmo
|
||||
avreste
|
||||
avrebbero
|
||||
avevo
|
||||
avevi
|
||||
aveva
|
||||
avevamo
|
||||
avevate
|
||||
avevano
|
||||
ebbi
|
||||
avesti
|
||||
ebbe
|
||||
avemmo
|
||||
aveste
|
||||
ebbero
|
||||
avessi
|
||||
avesse
|
||||
avessimo
|
||||
avessero
|
||||
avendo
|
||||
avuto
|
||||
avuta
|
||||
avuti
|
||||
avute
|
||||
|
||||
| forms of essere, to be (not including the infinitive):
|
||||
sono
|
||||
sei
|
||||
è
|
||||
siamo
|
||||
siete
|
||||
sia
|
||||
siate
|
||||
siano
|
||||
sarò
|
||||
sarai
|
||||
sarà
|
||||
saremo
|
||||
sarete
|
||||
saranno
|
||||
sarei
|
||||
saresti
|
||||
sarebbe
|
||||
saremmo
|
||||
sareste
|
||||
sarebbero
|
||||
ero
|
||||
eri
|
||||
era
|
||||
eravamo
|
||||
eravate
|
||||
erano
|
||||
fui
|
||||
fosti
|
||||
fu
|
||||
fummo
|
||||
foste
|
||||
furono
|
||||
fossi
|
||||
fosse
|
||||
fossimo
|
||||
fossero
|
||||
essendo
|
||||
|
||||
| forms of fare, to do (not including the infinitive, fa, fat-):
|
||||
faccio
|
||||
fai
|
||||
facciamo
|
||||
fanno
|
||||
faccia
|
||||
facciate
|
||||
facciano
|
||||
farò
|
||||
farai
|
||||
farà
|
||||
faremo
|
||||
farete
|
||||
faranno
|
||||
farei
|
||||
faresti
|
||||
farebbe
|
||||
faremmo
|
||||
fareste
|
||||
farebbero
|
||||
facevo
|
||||
facevi
|
||||
faceva
|
||||
facevamo
|
||||
facevate
|
||||
facevano
|
||||
feci
|
||||
facesti
|
||||
fece
|
||||
facemmo
|
||||
faceste
|
||||
fecero
|
||||
facessi
|
||||
facesse
|
||||
facessimo
|
||||
facessero
|
||||
facendo
|
||||
|
||||
| forms of stare, to be (not including the infinitive):
|
||||
sto
|
||||
stai
|
||||
sta
|
||||
stiamo
|
||||
stanno
|
||||
stia
|
||||
stiate
|
||||
stiano
|
||||
starò
|
||||
starai
|
||||
starà
|
||||
staremo
|
||||
starete
|
||||
staranno
|
||||
starei
|
||||
staresti
|
||||
starebbe
|
||||
staremmo
|
||||
stareste
|
||||
starebbero
|
||||
stavo
|
||||
stavi
|
||||
stava
|
||||
stavamo
|
||||
stavate
|
||||
stavano
|
||||
stetti
|
||||
stesti
|
||||
stette
|
||||
stemmo
|
||||
steste
|
||||
stettero
|
||||
stessi
|
||||
stesse
|
||||
stessimo
|
||||
stessero
|
||||
stando
|
||||
@@ -1,127 +0,0 @@
|
||||
#
|
||||
# This file defines a stopword set for Japanese.
|
||||
#
|
||||
# This set is made up of hand-picked frequent terms from segmented Japanese Wikipedia.
|
||||
# Punctuation characters and frequent kanji have mostly been left out. See LUCENE-3745
|
||||
# for frequency lists, etc. that can be useful for making your own set (if desired)
|
||||
#
|
||||
# Note that there is an overlap between these stopwords and the terms stopped when used
|
||||
# in combination with the JapanesePartOfSpeechStopFilter. When editing this file, note
|
||||
# that comments are not allowed on the same line as stopwords.
|
||||
#
|
||||
# Also note that stopping is done in a case-insensitive manner. Change your StopFilter
|
||||
# configuration if you need case-sensitive stopping. Lastly, note that stopping is done
|
||||
# using the same character width as the entries in this file. Since this StopFilter is
|
||||
# normally done after a CJKWidthFilter in your chain, you would usually want your romaji
|
||||
# entries to be in half-width and your kana entries to be in full-width.
|
||||
#
|
||||
の
|
||||
に
|
||||
は
|
||||
を
|
||||
た
|
||||
が
|
||||
で
|
||||
て
|
||||
と
|
||||
し
|
||||
れ
|
||||
さ
|
||||
ある
|
||||
いる
|
||||
も
|
||||
する
|
||||
から
|
||||
な
|
||||
こと
|
||||
として
|
||||
い
|
||||
や
|
||||
れる
|
||||
など
|
||||
なっ
|
||||
ない
|
||||
この
|
||||
ため
|
||||
その
|
||||
あっ
|
||||
よう
|
||||
また
|
||||
もの
|
||||
という
|
||||
あり
|
||||
まで
|
||||
られ
|
||||
なる
|
||||
へ
|
||||
か
|
||||
だ
|
||||
これ
|
||||
によって
|
||||
により
|
||||
おり
|
||||
より
|
||||
による
|
||||
ず
|
||||
なり
|
||||
られる
|
||||
において
|
||||
ば
|
||||
なかっ
|
||||
なく
|
||||
しかし
|
||||
について
|
||||
せ
|
||||
だっ
|
||||
その後
|
||||
できる
|
||||
それ
|
||||
う
|
||||
ので
|
||||
なお
|
||||
のみ
|
||||
でき
|
||||
き
|
||||
つ
|
||||
における
|
||||
および
|
||||
いう
|
||||
さらに
|
||||
でも
|
||||
ら
|
||||
たり
|
||||
その他
|
||||
に関する
|
||||
たち
|
||||
ます
|
||||
ん
|
||||
なら
|
||||
に対して
|
||||
特に
|
||||
せる
|
||||
及び
|
||||
これら
|
||||
とき
|
||||
では
|
||||
にて
|
||||
ほか
|
||||
ながら
|
||||
うち
|
||||
そして
|
||||
とともに
|
||||
ただし
|
||||
かつて
|
||||
それぞれ
|
||||
または
|
||||
お
|
||||
ほど
|
||||
ものの
|
||||
に対する
|
||||
ほとんど
|
||||
と共に
|
||||
といった
|
||||
です
|
||||
とも
|
||||
ところ
|
||||
ここ
|
||||
##### End of file
|
||||
@@ -1,172 +0,0 @@
|
||||
# Set of Latvian stopwords from A Stemming Algorithm for Latvian, Karlis Kreslins
|
||||
# the original list of over 800 forms was refined:
|
||||
# pronouns, adverbs, interjections were removed
|
||||
#
|
||||
# prepositions
|
||||
aiz
|
||||
ap
|
||||
ar
|
||||
apakš
|
||||
ārpus
|
||||
augšpus
|
||||
bez
|
||||
caur
|
||||
dēļ
|
||||
gar
|
||||
iekš
|
||||
iz
|
||||
kopš
|
||||
labad
|
||||
lejpus
|
||||
līdz
|
||||
no
|
||||
otrpus
|
||||
pa
|
||||
par
|
||||
pār
|
||||
pēc
|
||||
pie
|
||||
pirms
|
||||
pret
|
||||
priekš
|
||||
starp
|
||||
šaipus
|
||||
uz
|
||||
viņpus
|
||||
virs
|
||||
virspus
|
||||
zem
|
||||
apakšpus
|
||||
# Conjunctions
|
||||
un
|
||||
bet
|
||||
jo
|
||||
ja
|
||||
ka
|
||||
lai
|
||||
tomēr
|
||||
tikko
|
||||
turpretī
|
||||
arī
|
||||
kaut
|
||||
gan
|
||||
tādēļ
|
||||
tā
|
||||
ne
|
||||
tikvien
|
||||
vien
|
||||
kā
|
||||
ir
|
||||
te
|
||||
vai
|
||||
kamēr
|
||||
# Particles
|
||||
ar
|
||||
diezin
|
||||
droši
|
||||
diemžēl
|
||||
nebūt
|
||||
ik
|
||||
it
|
||||
taču
|
||||
nu
|
||||
pat
|
||||
tiklab
|
||||
iekšpus
|
||||
nedz
|
||||
tik
|
||||
nevis
|
||||
turpretim
|
||||
jeb
|
||||
iekam
|
||||
iekām
|
||||
iekāms
|
||||
kolīdz
|
||||
līdzko
|
||||
tiklīdz
|
||||
jebšu
|
||||
tālab
|
||||
tāpēc
|
||||
nekā
|
||||
itin
|
||||
jā
|
||||
jau
|
||||
jel
|
||||
nē
|
||||
nezin
|
||||
tad
|
||||
tikai
|
||||
vis
|
||||
tak
|
||||
iekams
|
||||
vien
|
||||
# modal verbs
|
||||
būt
|
||||
biju
|
||||
biji
|
||||
bija
|
||||
bijām
|
||||
bijāt
|
||||
esmu
|
||||
esi
|
||||
esam
|
||||
esat
|
||||
būšu
|
||||
būsi
|
||||
būs
|
||||
būsim
|
||||
būsiet
|
||||
tikt
|
||||
tiku
|
||||
tiki
|
||||
tika
|
||||
tikām
|
||||
tikāt
|
||||
tieku
|
||||
tiec
|
||||
tiek
|
||||
tiekam
|
||||
tiekat
|
||||
tikšu
|
||||
tiks
|
||||
tiksim
|
||||
tiksiet
|
||||
tapt
|
||||
tapi
|
||||
tapāt
|
||||
topat
|
||||
tapšu
|
||||
tapsi
|
||||
taps
|
||||
tapsim
|
||||
tapsiet
|
||||
kļūt
|
||||
kļuvu
|
||||
kļuvi
|
||||
kļuva
|
||||
kļuvām
|
||||
kļuvāt
|
||||
kļūstu
|
||||
kļūsti
|
||||
kļūst
|
||||
kļūstam
|
||||
kļūstat
|
||||
kļūšu
|
||||
kļūsi
|
||||
kļūs
|
||||
kļūsim
|
||||
kļūsiet
|
||||
# verbs
|
||||
varēt
|
||||
varēju
|
||||
varējām
|
||||
varēšu
|
||||
varēsim
|
||||
var
|
||||
varēji
|
||||
varējāt
|
||||
varēsi
|
||||
varēsiet
|
||||
varat
|
||||
varēja
|
||||
varēs
|
||||
@@ -1,117 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Dutch stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large sample of Dutch text.
|
||||
|
||||
| Dutch stop words frequently exhibit homonym clashes. These are indicated
|
||||
| clearly below.
|
||||
|
||||
de | the
|
||||
en | and
|
||||
van | of, from
|
||||
ik | I, the ego
|
||||
te | (1) chez, at etc, (2) to, (3) too
|
||||
dat | that, which
|
||||
die | that, those, who, which
|
||||
in | in, inside
|
||||
een | a, an, one
|
||||
hij | he
|
||||
het | the, it
|
||||
niet | not, nothing, naught
|
||||
zijn | (1) to be, being, (2) his, one's, its
|
||||
is | is
|
||||
was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river
|
||||
op | on, upon, at, in, up, used up
|
||||
aan | on, upon, to (as dative)
|
||||
met | with, by
|
||||
als | like, such as, when
|
||||
voor | (1) before, in front of, (2) furrow
|
||||
had | had, past tense all persons sing. of 'hebben' (have)
|
||||
er | there
|
||||
maar | but, only
|
||||
om | round, about, for etc
|
||||
hem | him
|
||||
dan | then
|
||||
zou | should/would, past tense all persons sing. of 'zullen'
|
||||
of | or, whether, if
|
||||
wat | what, something, anything
|
||||
mijn | possessive and noun 'mine'
|
||||
men | people, 'one'
|
||||
dit | this
|
||||
zo | so, thus, in this way
|
||||
door | through by
|
||||
over | over, across
|
||||
ze | she, her, they, them
|
||||
zich | oneself
|
||||
bij | (1) a bee, (2) by, near, at
|
||||
ook | also, too
|
||||
tot | till, until
|
||||
je | you
|
||||
mij | me
|
||||
uit | out of, from
|
||||
der | Old Dutch form of 'van der' still found in surnames
|
||||
daar | (1) there, (2) because
|
||||
haar | (1) her, their, them, (2) hair
|
||||
naar | (1) unpleasant, unwell etc, (2) towards, (3) as
|
||||
heb | present first person sing. of 'to have'
|
||||
hoe | how, why
|
||||
heeft | present third person sing. of 'to have'
|
||||
hebben | 'to have' and various parts thereof
|
||||
deze | this
|
||||
u | you
|
||||
want | (1) for, (2) mitten, (3) rigging
|
||||
nog | yet, still
|
||||
zal | 'shall', first and third person sing. of verb 'zullen' (will)
|
||||
me | me
|
||||
zij | she, they
|
||||
nu | now
|
||||
ge | 'thou', still used in Belgium and south Netherlands
|
||||
geen | none
|
||||
omdat | because
|
||||
iets | something, somewhat
|
||||
worden | to become, grow, get
|
||||
toch | yet, still
|
||||
al | all, every, each
|
||||
waren | (1) 'were' (2) to wander, (3) wares, (3)
|
||||
veel | much, many
|
||||
meer | (1) more, (2) lake
|
||||
doen | to do, to make
|
||||
toen | then, when
|
||||
moet | noun 'spot/mote' and present form of 'to must'
|
||||
ben | (1) am, (2) 'are' in interrogative second person singular of 'to be'
|
||||
zonder | without
|
||||
kan | noun 'can' and present form of 'to be able'
|
||||
hun | their, them
|
||||
dus | so, consequently
|
||||
alles | all, everything, anything
|
||||
onder | under, beneath
|
||||
ja | yes, of course
|
||||
eens | once, one day
|
||||
hier | here
|
||||
wie | who
|
||||
werd | imperfect third person sing. of 'become'
|
||||
altijd | always
|
||||
doch | yet, but etc
|
||||
wordt | present third person sing. of 'become'
|
||||
wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans
|
||||
kunnen | to be able
|
||||
ons | us/our
|
||||
zelf | self
|
||||
tegen | against, towards, at
|
||||
na | after, near
|
||||
reeds | already
|
||||
wil | (1) present tense of 'want', (2) 'will', noun, (3) fender
|
||||
kon | could; past tense of 'to be able'
|
||||
niets | nothing
|
||||
uw | your
|
||||
iemand | somebody
|
||||
geweest | been; past participle of 'be'
|
||||
andere | other
|
||||
@@ -1,192 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Norwegian stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This stop word list is for the dominant bokmål dialect. Words unique
|
||||
| to nynorsk are marked *.
|
||||
|
||||
| Revised by Jan Bruusgaard <Jan.Bruusgaard@ssb.no>, Jan 2005
|
||||
|
||||
og | and
|
||||
i | in
|
||||
jeg | I
|
||||
det | it/this/that
|
||||
at | to (w. inf.)
|
||||
en | a/an
|
||||
et | a/an
|
||||
den | it/this/that
|
||||
til | to
|
||||
er | is/am/are
|
||||
som | who/that
|
||||
på | on
|
||||
de | they / you(formal)
|
||||
med | with
|
||||
han | he
|
||||
av | of
|
||||
ikke | not
|
||||
ikkje | not *
|
||||
der | there
|
||||
så | so
|
||||
var | was/were
|
||||
meg | me
|
||||
seg | you
|
||||
men | but
|
||||
ett | one
|
||||
har | have
|
||||
om | about
|
||||
vi | we
|
||||
min | my
|
||||
mitt | my
|
||||
ha | have
|
||||
hadde | had
|
||||
hun | she
|
||||
nå | now
|
||||
over | over
|
||||
da | when/as
|
||||
ved | by/know
|
||||
fra | from
|
||||
du | you
|
||||
ut | out
|
||||
sin | your
|
||||
dem | them
|
||||
oss | us
|
||||
opp | up
|
||||
man | you/one
|
||||
kan | can
|
||||
hans | his
|
||||
hvor | where
|
||||
eller | or
|
||||
hva | what
|
||||
skal | shall/must
|
||||
selv | self (reflective)
|
||||
sjøl | self (reflective)
|
||||
her | here
|
||||
alle | all
|
||||
vil | will
|
||||
bli | become
|
||||
ble | became
|
||||
blei | became *
|
||||
blitt | have become
|
||||
kunne | could
|
||||
inn | in
|
||||
når | when
|
||||
være | be
|
||||
kom | come
|
||||
noen | some
|
||||
noe | some
|
||||
ville | would
|
||||
dere | you
|
||||
som | who/which/that
|
||||
deres | their/theirs
|
||||
kun | only/just
|
||||
ja | yes
|
||||
etter | after
|
||||
ned | down
|
||||
skulle | should
|
||||
denne | this
|
||||
for | for/because
|
||||
deg | you
|
||||
si | hers/his
|
||||
sine | hers/his
|
||||
sitt | hers/his
|
||||
mot | against
|
||||
å | to
|
||||
meget | much
|
||||
hvorfor | why
|
||||
dette | this
|
||||
disse | these/those
|
||||
uten | without
|
||||
hvordan | how
|
||||
ingen | none
|
||||
din | your
|
||||
ditt | your
|
||||
blir | become
|
||||
samme | same
|
||||
hvilken | which
|
||||
hvilke | which (plural)
|
||||
sånn | such a
|
||||
inni | inside/within
|
||||
mellom | between
|
||||
vår | our
|
||||
hver | each
|
||||
hvem | who
|
||||
vors | us/ours
|
||||
hvis | whose
|
||||
både | both
|
||||
bare | only/just
|
||||
enn | than
|
||||
fordi | as/because
|
||||
før | before
|
||||
mange | many
|
||||
også | also
|
||||
slik | just
|
||||
vært | been
|
||||
være | to be
|
||||
båe | both *
|
||||
begge | both
|
||||
siden | since
|
||||
dykk | your *
|
||||
dykkar | yours *
|
||||
dei | they *
|
||||
deira | them *
|
||||
deires | theirs *
|
||||
deim | them *
|
||||
di | your (fem.) *
|
||||
då | as/when *
|
||||
eg | I *
|
||||
ein | a/an *
|
||||
eit | a/an *
|
||||
eitt | a/an *
|
||||
elles | or *
|
||||
honom | he *
|
||||
hjå | at *
|
||||
ho | she *
|
||||
hoe | she *
|
||||
henne | her
|
||||
hennar | her/hers
|
||||
hennes | hers
|
||||
hoss | how *
|
||||
hossen | how *
|
||||
ikkje | not *
|
||||
ingi | noone *
|
||||
inkje | noone *
|
||||
korleis | how *
|
||||
korso | how *
|
||||
kva | what/which *
|
||||
kvar | where *
|
||||
kvarhelst | where *
|
||||
kven | who/whom *
|
||||
kvi | why *
|
||||
kvifor | why *
|
||||
me | we *
|
||||
medan | while *
|
||||
mi | my *
|
||||
mine | my *
|
||||
mykje | much *
|
||||
no | now *
|
||||
nokon | some (masc./neut.) *
|
||||
noka | some (fem.) *
|
||||
nokor | some *
|
||||
noko | some *
|
||||
nokre | some *
|
||||
si | his/hers *
|
||||
sia | since *
|
||||
sidan | since *
|
||||
so | so *
|
||||
somt | some *
|
||||
somme | some *
|
||||
um | about*
|
||||
upp | up *
|
||||
vere | be *
|
||||
vore | was *
|
||||
verte | become *
|
||||
vort | become *
|
||||
varte | became *
|
||||
vart | became *
|
||||
|
||||
@@ -1,251 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/portuguese/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Portuguese stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
|
||||
| The following is a ranked list (commonest to rarest) of stopwords
|
||||
| deriving from a large sample of text.
|
||||
|
||||
| Extra words have been added at the end.
|
||||
|
||||
de | of, from
|
||||
a | the; to, at; her
|
||||
o | the; him
|
||||
que | who, that
|
||||
e | and
|
||||
do | de + o
|
||||
da | de + a
|
||||
em | in
|
||||
um | a
|
||||
para | for
|
||||
| é from SER
|
||||
com | with
|
||||
não | not, no
|
||||
uma | a
|
||||
os | the; them
|
||||
no | em + o
|
||||
se | himself etc
|
||||
na | em + a
|
||||
por | for
|
||||
mais | more
|
||||
as | the; them
|
||||
dos | de + os
|
||||
como | as, like
|
||||
mas | but
|
||||
| foi from SER
|
||||
ao | a + o
|
||||
ele | he
|
||||
das | de + as
|
||||
| tem from TER
|
||||
à | a + a
|
||||
seu | his
|
||||
sua | her
|
||||
ou | or
|
||||
| ser from SER
|
||||
quando | when
|
||||
muito | much
|
||||
| há from HAV
|
||||
nos | em + os; us
|
||||
já | already, now
|
||||
| está from EST
|
||||
eu | I
|
||||
também | also
|
||||
só | only, just
|
||||
pelo | per + o
|
||||
pela | per + a
|
||||
até | up to
|
||||
isso | that
|
||||
ela | he
|
||||
entre | between
|
||||
| era from SER
|
||||
depois | after
|
||||
sem | without
|
||||
mesmo | same
|
||||
aos | a + os
|
||||
| ter from TER
|
||||
seus | his
|
||||
quem | whom
|
||||
nas | em + as
|
||||
me | me
|
||||
esse | that
|
||||
eles | they
|
||||
| estão from EST
|
||||
você | you
|
||||
| tinha from TER
|
||||
| foram from SER
|
||||
essa | that
|
||||
num | em + um
|
||||
nem | nor
|
||||
suas | her
|
||||
meu | my
|
||||
às | a + as
|
||||
minha | my
|
||||
| têm from TER
|
||||
numa | em + uma
|
||||
pelos | per + os
|
||||
elas | they
|
||||
| havia from HAV
|
||||
| seja from SER
|
||||
qual | which
|
||||
| será from SER
|
||||
nós | we
|
||||
| tenho from TER
|
||||
lhe | to him, her
|
||||
deles | of them
|
||||
essas | those
|
||||
esses | those
|
||||
pelas | per + as
|
||||
este | this
|
||||
| fosse from SER
|
||||
dele | of him
|
||||
|
||||
| other words. There are many contractions such as naquele = em+aquele,
|
||||
| mo = me+o, but they are rare.
|
||||
| Indefinite article plural forms are also rare.
|
||||
|
||||
tu | thou
|
||||
te | thee
|
||||
vocês | you (plural)
|
||||
vos | you
|
||||
lhes | to them
|
||||
meus | my
|
||||
minhas
|
||||
teu | thy
|
||||
tua
|
||||
teus
|
||||
tuas
|
||||
nosso | our
|
||||
nossa
|
||||
nossos
|
||||
nossas
|
||||
|
||||
dela | of her
|
||||
delas | of them
|
||||
|
||||
esta | this
|
||||
estes | these
|
||||
estas | these
|
||||
aquele | that
|
||||
aquela | that
|
||||
aqueles | those
|
||||
aquelas | those
|
||||
isto | this
|
||||
aquilo | that
|
||||
|
||||
| forms of estar, to be (not including the infinitive):
|
||||
estou
|
||||
está
|
||||
estamos
|
||||
estão
|
||||
estive
|
||||
esteve
|
||||
estivemos
|
||||
estiveram
|
||||
estava
|
||||
estávamos
|
||||
estavam
|
||||
estivera
|
||||
estivéramos
|
||||
esteja
|
||||
estejamos
|
||||
estejam
|
||||
estivesse
|
||||
estivéssemos
|
||||
estivessem
|
||||
estiver
|
||||
estivermos
|
||||
estiverem
|
||||
|
||||
| forms of haver, to have (not including the infinitive):
|
||||
hei
|
||||
há
|
||||
havemos
|
||||
hão
|
||||
houve
|
||||
houvemos
|
||||
houveram
|
||||
houvera
|
||||
houvéramos
|
||||
haja
|
||||
hajamos
|
||||
hajam
|
||||
houvesse
|
||||
houvéssemos
|
||||
houvessem
|
||||
houver
|
||||
houvermos
|
||||
houverem
|
||||
houverei
|
||||
houverá
|
||||
houveremos
|
||||
houverão
|
||||
houveria
|
||||
houveríamos
|
||||
houveriam
|
||||
|
||||
| forms of ser, to be (not including the infinitive):
|
||||
sou
|
||||
somos
|
||||
são
|
||||
era
|
||||
éramos
|
||||
eram
|
||||
fui
|
||||
foi
|
||||
fomos
|
||||
foram
|
||||
fora
|
||||
fôramos
|
||||
seja
|
||||
sejamos
|
||||
sejam
|
||||
fosse
|
||||
fôssemos
|
||||
fossem
|
||||
for
|
||||
formos
|
||||
forem
|
||||
serei
|
||||
será
|
||||
seremos
|
||||
serão
|
||||
seria
|
||||
seríamos
|
||||
seriam
|
||||
|
||||
| forms of ter, to have (not including the infinitive):
|
||||
tenho
|
||||
tem
|
||||
temos
|
||||
tém
|
||||
tinha
|
||||
tínhamos
|
||||
tinham
|
||||
tive
|
||||
teve
|
||||
tivemos
|
||||
tiveram
|
||||
tivera
|
||||
tivéramos
|
||||
tenha
|
||||
tenhamos
|
||||
tenham
|
||||
tivesse
|
||||
tivéssemos
|
||||
tivessem
|
||||
tiver
|
||||
tivermos
|
||||
tiverem
|
||||
terei
|
||||
terá
|
||||
teremos
|
||||
terão
|
||||
teria
|
||||
teríamos
|
||||
teriam
|
||||
@@ -1,233 +0,0 @@
|
||||
# This file was created by Jacques Savoy and is distributed under the BSD license.
|
||||
# See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
# Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
acea
|
||||
aceasta
|
||||
această
|
||||
aceea
|
||||
acei
|
||||
aceia
|
||||
acel
|
||||
acela
|
||||
acele
|
||||
acelea
|
||||
acest
|
||||
acesta
|
||||
aceste
|
||||
acestea
|
||||
aceşti
|
||||
aceştia
|
||||
acolo
|
||||
acum
|
||||
ai
|
||||
aia
|
||||
aibă
|
||||
aici
|
||||
al
|
||||
ăla
|
||||
ale
|
||||
alea
|
||||
ălea
|
||||
altceva
|
||||
altcineva
|
||||
am
|
||||
ar
|
||||
are
|
||||
aş
|
||||
aşadar
|
||||
asemenea
|
||||
asta
|
||||
ăsta
|
||||
astăzi
|
||||
astea
|
||||
ăstea
|
||||
ăştia
|
||||
asupra
|
||||
aţi
|
||||
au
|
||||
avea
|
||||
avem
|
||||
aveţi
|
||||
azi
|
||||
bine
|
||||
bucur
|
||||
bună
|
||||
ca
|
||||
că
|
||||
căci
|
||||
când
|
||||
care
|
||||
cărei
|
||||
căror
|
||||
cărui
|
||||
cât
|
||||
câte
|
||||
câţi
|
||||
către
|
||||
câtva
|
||||
ce
|
||||
cel
|
||||
ceva
|
||||
chiar
|
||||
cînd
|
||||
cine
|
||||
cineva
|
||||
cît
|
||||
cîte
|
||||
cîţi
|
||||
cîtva
|
||||
contra
|
||||
cu
|
||||
cum
|
||||
cumva
|
||||
curând
|
||||
curînd
|
||||
da
|
||||
dă
|
||||
dacă
|
||||
dar
|
||||
datorită
|
||||
de
|
||||
deci
|
||||
deja
|
||||
deoarece
|
||||
departe
|
||||
deşi
|
||||
din
|
||||
dinaintea
|
||||
dintr
|
||||
dintre
|
||||
drept
|
||||
după
|
||||
ea
|
||||
ei
|
||||
el
|
||||
ele
|
||||
eram
|
||||
este
|
||||
eşti
|
||||
eu
|
||||
face
|
||||
fără
|
||||
fi
|
||||
fie
|
||||
fiecare
|
||||
fii
|
||||
fim
|
||||
fiţi
|
||||
iar
|
||||
ieri
|
||||
îi
|
||||
îl
|
||||
îmi
|
||||
împotriva
|
||||
în
|
||||
înainte
|
||||
înaintea
|
||||
încât
|
||||
încît
|
||||
încotro
|
||||
între
|
||||
întrucât
|
||||
întrucît
|
||||
îţi
|
||||
la
|
||||
lângă
|
||||
le
|
||||
li
|
||||
lîngă
|
||||
lor
|
||||
lui
|
||||
mă
|
||||
mâine
|
||||
mea
|
||||
mei
|
||||
mele
|
||||
mereu
|
||||
meu
|
||||
mi
|
||||
mine
|
||||
mult
|
||||
multă
|
||||
mulţi
|
||||
ne
|
||||
nicăieri
|
||||
nici
|
||||
nimeni
|
||||
nişte
|
||||
noastră
|
||||
noastre
|
||||
noi
|
||||
noştri
|
||||
nostru
|
||||
nu
|
||||
ori
|
||||
oricând
|
||||
oricare
|
||||
oricât
|
||||
orice
|
||||
oricînd
|
||||
oricine
|
||||
oricît
|
||||
oricum
|
||||
oriunde
|
||||
până
|
||||
pe
|
||||
pentru
|
||||
peste
|
||||
pînă
|
||||
poate
|
||||
pot
|
||||
prea
|
||||
prima
|
||||
primul
|
||||
prin
|
||||
printr
|
||||
sa
|
||||
să
|
||||
săi
|
||||
sale
|
||||
sau
|
||||
său
|
||||
se
|
||||
şi
|
||||
sînt
|
||||
sîntem
|
||||
sînteţi
|
||||
spre
|
||||
sub
|
||||
sunt
|
||||
suntem
|
||||
sunteţi
|
||||
ta
|
||||
tăi
|
||||
tale
|
||||
tău
|
||||
te
|
||||
ţi
|
||||
ţie
|
||||
tine
|
||||
toată
|
||||
toate
|
||||
tot
|
||||
toţi
|
||||
totuşi
|
||||
tu
|
||||
un
|
||||
una
|
||||
unde
|
||||
undeva
|
||||
unei
|
||||
unele
|
||||
uneori
|
||||
unor
|
||||
vă
|
||||
vi
|
||||
voastră
|
||||
voastre
|
||||
voi
|
||||
voştri
|
||||
vostru
|
||||
vouă
|
||||
vreo
|
||||
vreun
|
||||
@@ -1,241 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| a russian stop word list. comments begin with vertical bar. each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| this is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
| letter `ё' is translated to `е'.
|
||||
|
||||
и | and
|
||||
в | in/into
|
||||
во | alternative form
|
||||
не | not
|
||||
что | what/that
|
||||
он | he
|
||||
на | on/onto
|
||||
я | i
|
||||
с | from
|
||||
со | alternative form
|
||||
как | how
|
||||
а | milder form of `no' (but)
|
||||
то | conjunction and form of `that'
|
||||
все | all
|
||||
она | she
|
||||
так | so, thus
|
||||
его | him
|
||||
но | but
|
||||
да | yes/and
|
||||
ты | thou
|
||||
к | towards, by
|
||||
у | around, chez
|
||||
же | intensifier particle
|
||||
вы | you
|
||||
за | beyond, behind
|
||||
бы | conditional/subj. particle
|
||||
по | up to, along
|
||||
только | only
|
||||
ее | her
|
||||
мне | to me
|
||||
было | it was
|
||||
вот | here is/are, particle
|
||||
от | away from
|
||||
меня | me
|
||||
еще | still, yet, more
|
||||
нет | no, there isnt/arent
|
||||
о | about
|
||||
из | out of
|
||||
ему | to him
|
||||
теперь | now
|
||||
когда | when
|
||||
даже | even
|
||||
ну | so, well
|
||||
вдруг | suddenly
|
||||
ли | interrogative particle
|
||||
если | if
|
||||
уже | already, but homonym of `narrower'
|
||||
или | or
|
||||
ни | neither
|
||||
быть | to be
|
||||
был | he was
|
||||
него | prepositional form of его
|
||||
до | up to
|
||||
вас | you accusative
|
||||
нибудь | indef. suffix preceded by hyphen
|
||||
опять | again
|
||||
уж | already, but homonym of `adder'
|
||||
вам | to you
|
||||
сказал | he said
|
||||
ведь | particle `after all'
|
||||
там | there
|
||||
потом | then
|
||||
себя | oneself
|
||||
ничего | nothing
|
||||
ей | to her
|
||||
может | usually with `быть' as `maybe'
|
||||
они | they
|
||||
тут | here
|
||||
где | where
|
||||
есть | there is/are
|
||||
надо | got to, must
|
||||
ней | prepositional form of ей
|
||||
для | for
|
||||
мы | we
|
||||
тебя | thee
|
||||
их | them, their
|
||||
чем | than
|
||||
была | she was
|
||||
сам | self
|
||||
чтоб | in order to
|
||||
без | without
|
||||
будто | as if
|
||||
человек | man, person, one
|
||||
чего | genitive form of `what'
|
||||
раз | once
|
||||
тоже | also
|
||||
себе | to oneself
|
||||
под | beneath
|
||||
жизнь | life
|
||||
будет | will be
|
||||
ж | short form of intensifer particle `же'
|
||||
тогда | then
|
||||
кто | who
|
||||
этот | this
|
||||
говорил | was saying
|
||||
того | genitive form of `that'
|
||||
потому | for that reason
|
||||
этого | genitive form of `this'
|
||||
какой | which
|
||||
совсем | altogether
|
||||
ним | prepositional form of `его', `они'
|
||||
здесь | here
|
||||
этом | prepositional form of `этот'
|
||||
один | one
|
||||
почти | almost
|
||||
мой | my
|
||||
тем | instrumental/dative plural of `тот', `то'
|
||||
чтобы | full form of `in order that'
|
||||
нее | her (acc.)
|
||||
кажется | it seems
|
||||
сейчас | now
|
||||
были | they were
|
||||
куда | where to
|
||||
зачем | why
|
||||
сказать | to say
|
||||
всех | all (acc., gen. preposn. plural)
|
||||
никогда | never
|
||||
сегодня | today
|
||||
можно | possible, one can
|
||||
при | by
|
||||
наконец | finally
|
||||
два | two
|
||||
об | alternative form of `о', about
|
||||
другой | another
|
||||
хоть | even
|
||||
после | after
|
||||
над | above
|
||||
больше | more
|
||||
тот | that one (masc.)
|
||||
через | across, in
|
||||
эти | these
|
||||
нас | us
|
||||
про | about
|
||||
всего | in all, only, of all
|
||||
них | prepositional form of `они' (they)
|
||||
какая | which, feminine
|
||||
много | lots
|
||||
разве | interrogative particle
|
||||
сказала | she said
|
||||
три | three
|
||||
эту | this, acc. fem. sing.
|
||||
моя | my, feminine
|
||||
впрочем | moreover, besides
|
||||
хорошо | good
|
||||
свою | ones own, acc. fem. sing.
|
||||
этой | oblique form of `эта', fem. `this'
|
||||
перед | in front of
|
||||
иногда | sometimes
|
||||
лучше | better
|
||||
чуть | a little
|
||||
том | preposn. form of `that one'
|
||||
нельзя | one must not
|
||||
такой | such a one
|
||||
им | to them
|
||||
более | more
|
||||
всегда | always
|
||||
конечно | of course
|
||||
всю | acc. fem. sing of `all'
|
||||
между | between
|
||||
|
||||
|
||||
| b: some paradigms
|
||||
|
|
||||
| personal pronouns
|
||||
|
|
||||
| я меня мне мной [мною]
|
||||
| ты тебя тебе тобой [тобою]
|
||||
| он его ему им [него, нему, ним]
|
||||
| она ее эи ею [нее, нэи, нею]
|
||||
| оно его ему им [него, нему, ним]
|
||||
|
|
||||
| мы нас нам нами
|
||||
| вы вас вам вами
|
||||
| они их им ими [них, ним, ними]
|
||||
|
|
||||
| себя себе собой [собою]
|
||||
|
|
||||
| demonstrative pronouns: этот (this), тот (that)
|
||||
|
|
||||
| этот эта это эти
|
||||
| этого эты это эти
|
||||
| этого этой этого этих
|
||||
| этому этой этому этим
|
||||
| этим этой этим [этою] этими
|
||||
| этом этой этом этих
|
||||
|
|
||||
| тот та то те
|
||||
| того ту то те
|
||||
| того той того тех
|
||||
| тому той тому тем
|
||||
| тем той тем [тою] теми
|
||||
| том той том тех
|
||||
|
|
||||
| determinative pronouns
|
||||
|
|
||||
| (a) весь (all)
|
||||
|
|
||||
| весь вся все все
|
||||
| всего всю все все
|
||||
| всего всей всего всех
|
||||
| всему всей всему всем
|
||||
| всем всей всем [всею] всеми
|
||||
| всем всей всем всех
|
||||
|
|
||||
| (b) сам (himself etc)
|
||||
|
|
||||
| сам сама само сами
|
||||
| самого саму само самих
|
||||
| самого самой самого самих
|
||||
| самому самой самому самим
|
||||
| самим самой самим [самою] самими
|
||||
| самом самой самом самих
|
||||
|
|
||||
| stems of verbs `to be', `to have', `to do' and modal
|
||||
|
|
||||
| быть бы буд быв есть суть
|
||||
| име
|
||||
| дел
|
||||
| мог мож мочь
|
||||
| уме
|
||||
| хоч хот
|
||||
| долж
|
||||
| можн
|
||||
| нужн
|
||||
| нельзя
|
||||
|
||||
@@ -1,131 +0,0 @@
|
||||
| From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
|
||||
| This file is distributed under the BSD License.
|
||||
| See http://snowball.tartarus.org/license.php
|
||||
| Also see http://www.opensource.org/licenses/bsd-license.html
|
||||
| - Encoding was converted to UTF-8.
|
||||
| - This notice was added.
|
||||
|
||||
| A Swedish stop word list. Comments begin with vertical bar. Each stop
|
||||
| word is at the start of a line.
|
||||
|
||||
| This is a ranked list (commonest to rarest) of stopwords derived from
|
||||
| a large text sample.
|
||||
|
||||
| Swedish stop words occasionally exhibit homonym clashes. For example
|
||||
| så = so, but also seed. These are indicated clearly below.
|
||||
|
||||
och | and
|
||||
det | it, this/that
|
||||
att | to (with infinitive)
|
||||
i | in, at
|
||||
en | a
|
||||
jag | I
|
||||
hon | she
|
||||
som | who, that
|
||||
han | he
|
||||
på | on
|
||||
den | it, this/that
|
||||
med | with
|
||||
var | where, each
|
||||
sig | him(self) etc
|
||||
för | for
|
||||
så | so (also: seed)
|
||||
till | to
|
||||
är | is
|
||||
men | but
|
||||
ett | a
|
||||
om | if; around, about
|
||||
hade | had
|
||||
de | they, these/those
|
||||
av | of
|
||||
icke | not, no
|
||||
mig | me
|
||||
du | you
|
||||
henne | her
|
||||
då | then, when
|
||||
sin | his
|
||||
nu | now
|
||||
har | have
|
||||
inte | inte någon = no one
|
||||
hans | his
|
||||
honom | him
|
||||
skulle | 'sake'
|
||||
hennes | her
|
||||
där | there
|
||||
min | my
|
||||
man | one (pronoun)
|
||||
ej | nor
|
||||
vid | at, by, on (also: vast)
|
||||
kunde | could
|
||||
något | some etc
|
||||
från | from, off
|
||||
ut | out
|
||||
när | when
|
||||
efter | after, behind
|
||||
upp | up
|
||||
vi | we
|
||||
dem | them
|
||||
vara | be
|
||||
vad | what
|
||||
över | over
|
||||
än | than
|
||||
dig | you
|
||||
kan | can
|
||||
sina | his
|
||||
här | here
|
||||
ha | have
|
||||
mot | towards
|
||||
alla | all
|
||||
under | under (also: wonder)
|
||||
någon | some etc
|
||||
eller | or (else)
|
||||
allt | all
|
||||
mycket | much
|
||||
sedan | since
|
||||
ju | why
|
||||
denna | this/that
|
||||
själv | myself, yourself etc
|
||||
detta | this/that
|
||||
åt | to
|
||||
utan | without
|
||||
varit | was
|
||||
hur | how
|
||||
ingen | no
|
||||
mitt | my
|
||||
ni | you
|
||||
bli | to be, become
|
||||
blev | from bli
|
||||
oss | us
|
||||
din | thy
|
||||
dessa | these/those
|
||||
några | some etc
|
||||
deras | their
|
||||
blir | from bli
|
||||
mina | my
|
||||
samma | (the) same
|
||||
vilken | who, that
|
||||
er | you, your
|
||||
sådan | such a
|
||||
vår | our
|
||||
blivit | from bli
|
||||
dess | its
|
||||
inom | within
|
||||
mellan | between
|
||||
sådant | such a
|
||||
varför | why
|
||||
varje | each
|
||||
vilka | who, that
|
||||
ditt | thy
|
||||
vem | who
|
||||
vilket | who, that
|
||||
sitta | his
|
||||
sådana | such a
|
||||
vart | each
|
||||
dina | thy
|
||||
vars | whose
|
||||
vårt | our
|
||||
våra | our
|
||||
ert | your
|
||||
era | your
|
||||
vilkas | whose
|
||||
|
||||
@@ -1,119 +0,0 @@
|
||||
# Thai stopwords from:
|
||||
# "Opinion Detection in Thai Political News Columns
|
||||
# Based on Subjectivity Analysis"
|
||||
# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak
|
||||
ไว้
|
||||
ไม่
|
||||
ไป
|
||||
ได้
|
||||
ให้
|
||||
ใน
|
||||
โดย
|
||||
แห่ง
|
||||
แล้ว
|
||||
และ
|
||||
แรก
|
||||
แบบ
|
||||
แต่
|
||||
เอง
|
||||
เห็น
|
||||
เลย
|
||||
เริ่ม
|
||||
เรา
|
||||
เมื่อ
|
||||
เพื่อ
|
||||
เพราะ
|
||||
เป็นการ
|
||||
เป็น
|
||||
เปิดเผย
|
||||
เปิด
|
||||
เนื่องจาก
|
||||
เดียวกัน
|
||||
เดียว
|
||||
เช่น
|
||||
เฉพาะ
|
||||
เคย
|
||||
เข้า
|
||||
เขา
|
||||
อีก
|
||||
อาจ
|
||||
อะไร
|
||||
ออก
|
||||
อย่าง
|
||||
อยู่
|
||||
อยาก
|
||||
หาก
|
||||
หลาย
|
||||
หลังจาก
|
||||
หลัง
|
||||
หรือ
|
||||
หนึ่ง
|
||||
ส่วน
|
||||
ส่ง
|
||||
สุด
|
||||
สําหรับ
|
||||
ว่า
|
||||
วัน
|
||||
ลง
|
||||
ร่วม
|
||||
ราย
|
||||
รับ
|
||||
ระหว่าง
|
||||
รวม
|
||||
ยัง
|
||||
มี
|
||||
มาก
|
||||
มา
|
||||
พร้อม
|
||||
พบ
|
||||
ผ่าน
|
||||
ผล
|
||||
บาง
|
||||
น่า
|
||||
นี้
|
||||
นํา
|
||||
นั้น
|
||||
นัก
|
||||
นอกจาก
|
||||
ทุก
|
||||
ที่สุด
|
||||
ที่
|
||||
ทําให้
|
||||
ทํา
|
||||
ทาง
|
||||
ทั้งนี้
|
||||
ทั้ง
|
||||
ถ้า
|
||||
ถูก
|
||||
ถึง
|
||||
ต้อง
|
||||
ต่างๆ
|
||||
ต่าง
|
||||
ต่อ
|
||||
ตาม
|
||||
ตั้งแต่
|
||||
ตั้ง
|
||||
ด้าน
|
||||
ด้วย
|
||||
ดัง
|
||||
ซึ่ง
|
||||
ช่วง
|
||||
จึง
|
||||
จาก
|
||||
จัด
|
||||
จะ
|
||||
คือ
|
||||
ความ
|
||||
ครั้ง
|
||||
คง
|
||||
ขึ้น
|
||||
ของ
|
||||
ขอ
|
||||
ขณะ
|
||||
ก่อน
|
||||
ก็
|
||||
การ
|
||||
กับ
|
||||
กัน
|
||||
กว่า
|
||||
กล่าว
|
||||
@@ -1,212 +0,0 @@
|
||||
# Turkish stopwords from LUCENE-559
|
||||
# merged with the list from "Information Retrieval on Turkish Texts"
|
||||
# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf)
|
||||
acaba
|
||||
altmış
|
||||
altı
|
||||
ama
|
||||
ancak
|
||||
arada
|
||||
aslında
|
||||
ayrıca
|
||||
bana
|
||||
bazı
|
||||
belki
|
||||
ben
|
||||
benden
|
||||
beni
|
||||
benim
|
||||
beri
|
||||
beş
|
||||
bile
|
||||
bin
|
||||
bir
|
||||
birçok
|
||||
biri
|
||||
birkaç
|
||||
birkez
|
||||
birşey
|
||||
birşeyi
|
||||
biz
|
||||
bize
|
||||
bizden
|
||||
bizi
|
||||
bizim
|
||||
böyle
|
||||
böylece
|
||||
bu
|
||||
buna
|
||||
bunda
|
||||
bundan
|
||||
bunlar
|
||||
bunları
|
||||
bunların
|
||||
bunu
|
||||
bunun
|
||||
burada
|
||||
çok
|
||||
çünkü
|
||||
da
|
||||
daha
|
||||
dahi
|
||||
de
|
||||
defa
|
||||
değil
|
||||
diğer
|
||||
diye
|
||||
doksan
|
||||
dokuz
|
||||
dolayı
|
||||
dolayısıyla
|
||||
dört
|
||||
edecek
|
||||
eden
|
||||
ederek
|
||||
edilecek
|
||||
ediliyor
|
||||
edilmesi
|
||||
ediyor
|
||||
eğer
|
||||
elli
|
||||
en
|
||||
etmesi
|
||||
etti
|
||||
ettiği
|
||||
ettiğini
|
||||
gibi
|
||||
göre
|
||||
halen
|
||||
hangi
|
||||
hatta
|
||||
hem
|
||||
henüz
|
||||
hep
|
||||
hepsi
|
||||
her
|
||||
herhangi
|
||||
herkesin
|
||||
hiç
|
||||
hiçbir
|
||||
için
|
||||
iki
|
||||
ile
|
||||
ilgili
|
||||
ise
|
||||
işte
|
||||
itibaren
|
||||
itibariyle
|
||||
kadar
|
||||
karşın
|
||||
katrilyon
|
||||
kendi
|
||||
kendilerine
|
||||
kendini
|
||||
kendisi
|
||||
kendisine
|
||||
kendisini
|
||||
kez
|
||||
ki
|
||||
kim
|
||||
kimden
|
||||
kime
|
||||
kimi
|
||||
kimse
|
||||
kırk
|
||||
milyar
|
||||
milyon
|
||||
mu
|
||||
mü
|
||||
mı
|
||||
nasıl
|
||||
ne
|
||||
neden
|
||||
nedenle
|
||||
nerde
|
||||
nerede
|
||||
nereye
|
||||
niye
|
||||
niçin
|
||||
o
|
||||
olan
|
||||
olarak
|
||||
oldu
|
||||
olduğu
|
||||
olduğunu
|
||||
olduklarını
|
||||
olmadı
|
||||
olmadığı
|
||||
olmak
|
||||
olması
|
||||
olmayan
|
||||
olmaz
|
||||
olsa
|
||||
olsun
|
||||
olup
|
||||
olur
|
||||
olursa
|
||||
oluyor
|
||||
on
|
||||
ona
|
||||
ondan
|
||||
onlar
|
||||
onlardan
|
||||
onları
|
||||
onların
|
||||
onu
|
||||
onun
|
||||
otuz
|
||||
oysa
|
||||
öyle
|
||||
pek
|
||||
rağmen
|
||||
sadece
|
||||
sanki
|
||||
sekiz
|
||||
seksen
|
||||
sen
|
||||
senden
|
||||
seni
|
||||
senin
|
||||
siz
|
||||
sizden
|
||||
sizi
|
||||
sizin
|
||||
şey
|
||||
şeyden
|
||||
şeyi
|
||||
şeyler
|
||||
şöyle
|
||||
şu
|
||||
şuna
|
||||
şunda
|
||||
şundan
|
||||
şunları
|
||||
şunu
|
||||
tarafından
|
||||
trilyon
|
||||
tüm
|
||||
üç
|
||||
üzere
|
||||
var
|
||||
vardı
|
||||
ve
|
||||
veya
|
||||
ya
|
||||
yani
|
||||
yapacak
|
||||
yapılan
|
||||
yapılması
|
||||
yapıyor
|
||||
yapmak
|
||||
yaptı
|
||||
yaptığı
|
||||
yaptığını
|
||||
yaptıkları
|
||||
yedi
|
||||
yerine
|
||||
yetmiş
|
||||
yine
|
||||
yirmi
|
||||
yoksa
|
||||
yüz
|
||||
zaten
|
||||
@@ -1,29 +0,0 @@
|
||||
#
|
||||
# This is a sample user dictionary for Kuromoji (JapaneseTokenizer)
|
||||
#
|
||||
# Add entries to this file in order to override the statistical model in terms
|
||||
# of segmentation, readings and part-of-speech tags. Notice that entries do
|
||||
# not have weights since they are always used when found. This is by-design
|
||||
# in order to maximize ease-of-use.
|
||||
#
|
||||
# Entries are defined using the following CSV format:
|
||||
# <text>,<token 1> ... <token n>,<reading 1> ... <reading n>,<part-of-speech tag>
|
||||
#
|
||||
# Notice that a single half-width space separates tokens and readings, and
|
||||
# that the number tokens and readings must match exactly.
|
||||
#
|
||||
# Also notice that multiple entries with the same <text> is undefined.
|
||||
#
|
||||
# Whitespace only lines are ignored. Comments are not allowed on entry lines.
|
||||
#
|
||||
|
||||
# Custom segmentation for kanji compounds
|
||||
日本経済新聞,日本 経済 新聞,ニホン ケイザイ シンブン,カスタム名詞
|
||||
関西国際空港,関西 国際 空港,カンサイ コクサイ クウコウ,カスタム名詞
|
||||
|
||||
# Custom segmentation for compound katakana
|
||||
トートバッグ,トート バッグ,トート バッグ,かずカナ名詞
|
||||
ショルダーバッグ,ショルダー バッグ,ショルダー バッグ,かずカナ名詞
|
||||
|
||||
# Custom reading for former sumo wrestler
|
||||
朝青龍,朝青龍,アサショウリュウ,カスタム人名
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,246 +0,0 @@
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Syntax:
|
||||
# "source" => "target"
|
||||
# "source".length() > 0 (source cannot be empty.)
|
||||
# "target".length() >= 0 (target can be empty.)
|
||||
|
||||
# example:
|
||||
# "À" => "A"
|
||||
# "\u00C0" => "A"
|
||||
# "\u00C0" => "\u0041"
|
||||
# "ß" => "ss"
|
||||
# "\t" => " "
|
||||
# "\n" => ""
|
||||
|
||||
# À => A
|
||||
"\u00C0" => "A"
|
||||
|
||||
# Á => A
|
||||
"\u00C1" => "A"
|
||||
|
||||
# Â => A
|
||||
"\u00C2" => "A"
|
||||
|
||||
# Ã => A
|
||||
"\u00C3" => "A"
|
||||
|
||||
# Ä => A
|
||||
"\u00C4" => "A"
|
||||
|
||||
# Å => A
|
||||
"\u00C5" => "A"
|
||||
|
||||
# Æ => AE
|
||||
"\u00C6" => "AE"
|
||||
|
||||
# Ç => C
|
||||
"\u00C7" => "C"
|
||||
|
||||
# È => E
|
||||
"\u00C8" => "E"
|
||||
|
||||
# É => E
|
||||
"\u00C9" => "E"
|
||||
|
||||
# Ê => E
|
||||
"\u00CA" => "E"
|
||||
|
||||
# Ë => E
|
||||
"\u00CB" => "E"
|
||||
|
||||
# Ì => I
|
||||
"\u00CC" => "I"
|
||||
|
||||
# Í => I
|
||||
"\u00CD" => "I"
|
||||
|
||||
# Î => I
|
||||
"\u00CE" => "I"
|
||||
|
||||
# Ï => I
|
||||
"\u00CF" => "I"
|
||||
|
||||
# IJ => IJ
|
||||
"\u0132" => "IJ"
|
||||
|
||||
# Ð => D
|
||||
"\u00D0" => "D"
|
||||
|
||||
# Ñ => N
|
||||
"\u00D1" => "N"
|
||||
|
||||
# Ò => O
|
||||
"\u00D2" => "O"
|
||||
|
||||
# Ó => O
|
||||
"\u00D3" => "O"
|
||||
|
||||
# Ô => O
|
||||
"\u00D4" => "O"
|
||||
|
||||
# Õ => O
|
||||
"\u00D5" => "O"
|
||||
|
||||
# Ö => O
|
||||
"\u00D6" => "O"
|
||||
|
||||
# Ø => O
|
||||
"\u00D8" => "O"
|
||||
|
||||
# Œ => OE
|
||||
"\u0152" => "OE"
|
||||
|
||||
# Þ
|
||||
"\u00DE" => "TH"
|
||||
|
||||
# Ù => U
|
||||
"\u00D9" => "U"
|
||||
|
||||
# Ú => U
|
||||
"\u00DA" => "U"
|
||||
|
||||
# Û => U
|
||||
"\u00DB" => "U"
|
||||
|
||||
# Ü => U
|
||||
"\u00DC" => "U"
|
||||
|
||||
# Ý => Y
|
||||
"\u00DD" => "Y"
|
||||
|
||||
# Ÿ => Y
|
||||
"\u0178" => "Y"
|
||||
|
||||
# à => a
|
||||
"\u00E0" => "a"
|
||||
|
||||
# á => a
|
||||
"\u00E1" => "a"
|
||||
|
||||
# â => a
|
||||
"\u00E2" => "a"
|
||||
|
||||
# ã => a
|
||||
"\u00E3" => "a"
|
||||
|
||||
# ä => a
|
||||
"\u00E4" => "a"
|
||||
|
||||
# å => a
|
||||
"\u00E5" => "a"
|
||||
|
||||
# æ => ae
|
||||
"\u00E6" => "ae"
|
||||
|
||||
# ç => c
|
||||
"\u00E7" => "c"
|
||||
|
||||
# è => e
|
||||
"\u00E8" => "e"
|
||||
|
||||
# é => e
|
||||
"\u00E9" => "e"
|
||||
|
||||
# ê => e
|
||||
"\u00EA" => "e"
|
||||
|
||||
# ë => e
|
||||
"\u00EB" => "e"
|
||||
|
||||
# ì => i
|
||||
"\u00EC" => "i"
|
||||
|
||||
# í => i
|
||||
"\u00ED" => "i"
|
||||
|
||||
# î => i
|
||||
"\u00EE" => "i"
|
||||
|
||||
# ï => i
|
||||
"\u00EF" => "i"
|
||||
|
||||
# ij => ij
|
||||
"\u0133" => "ij"
|
||||
|
||||
# ð => d
|
||||
"\u00F0" => "d"
|
||||
|
||||
# ñ => n
|
||||
"\u00F1" => "n"
|
||||
|
||||
# ò => o
|
||||
"\u00F2" => "o"
|
||||
|
||||
# ó => o
|
||||
"\u00F3" => "o"
|
||||
|
||||
# ô => o
|
||||
"\u00F4" => "o"
|
||||
|
||||
# õ => o
|
||||
"\u00F5" => "o"
|
||||
|
||||
# ö => o
|
||||
"\u00F6" => "o"
|
||||
|
||||
# ø => o
|
||||
"\u00F8" => "o"
|
||||
|
||||
# œ => oe
|
||||
"\u0153" => "oe"
|
||||
|
||||
# ß => ss
|
||||
"\u00DF" => "ss"
|
||||
|
||||
# þ => th
|
||||
"\u00FE" => "th"
|
||||
|
||||
# ù => u
|
||||
"\u00F9" => "u"
|
||||
|
||||
# ú => u
|
||||
"\u00FA" => "u"
|
||||
|
||||
# û => u
|
||||
"\u00FB" => "u"
|
||||
|
||||
# ü => u
|
||||
"\u00FC" => "u"
|
||||
|
||||
# ý => y
|
||||
"\u00FD" => "y"
|
||||
|
||||
# ÿ => y
|
||||
"\u00FF" => "y"
|
||||
|
||||
# ff => ff
|
||||
"\uFB00" => "ff"
|
||||
|
||||
# fi => fi
|
||||
"\uFB01" => "fi"
|
||||
|
||||
# fl => fl
|
||||
"\uFB02" => "fl"
|
||||
|
||||
# ffi => ffi
|
||||
"\uFB03" => "ffi"
|
||||
|
||||
# ffl => ffl
|
||||
"\uFB04" => "ffl"
|
||||
|
||||
# ſt => ft
|
||||
"\uFB05" => "ft"
|
||||
|
||||
# st => st
|
||||
"\uFB06" => "st"
|
||||
@@ -1,21 +0,0 @@
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# Use a protected word file to protect against the stemmer reducing two
|
||||
# unrelated words to the same base word.
|
||||
|
||||
# Some non-words that normally won't be encountered,
|
||||
# just to test that they won't be stemmed.
|
||||
dontstems
|
||||
zwhacky
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,24 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
user=
|
||||
solr_hostname=localhost
|
||||
solr_port=8983
|
||||
rsyncd_port=18983
|
||||
data_dir=
|
||||
webapp_name=solr
|
||||
master_host=
|
||||
master_data_dir=
|
||||
master_status_dir=
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,2 +0,0 @@
|
||||
pizza
|
||||
history
|
||||
@@ -1,14 +0,0 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
@@ -1,29 +0,0 @@
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
#some test synonym mappings unlikely to appear in real input text
|
||||
aaafoo => aaabar
|
||||
bbbfoo => bbbfoo bbbbar
|
||||
cccfoo => cccbar cccbaz
|
||||
fooaaa,baraaa,bazaaa
|
||||
|
||||
# Some synonym groups specific to this example
|
||||
GB,gib,gigabyte,gigabytes
|
||||
MB,mib,megabyte,megabytes
|
||||
Television, Televisions, TV, TVs
|
||||
#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
|
||||
#after us won't split it into two words.
|
||||
|
||||
# Synonym mappings can be used for spelling correction too
|
||||
pixima => pixma
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
/*
|
||||
This is a basic skeleton JavaScript update processor.
|
||||
|
||||
In order for this to be executed, it must be properly wired into solrconfig.xml; by default it is commented out in
|
||||
the example solrconfig.xml and must be uncommented to be enabled.
|
||||
|
||||
See http://wiki.apache.org/solr/ScriptUpdateProcessor for more details.
|
||||
*/
|
||||
|
||||
function processAdd(cmd) {
|
||||
|
||||
doc = cmd.solrDoc; // org.apache.solr.common.SolrInputDocument
|
||||
id = doc.getFieldValue("id");
|
||||
logger.info("update-script#processAdd: id=" + id);
|
||||
|
||||
// Set a field value:
|
||||
// doc.setField("foo_s", "whatever");
|
||||
|
||||
// Get a configuration parameter:
|
||||
// config_param = params.get('config_param'); // "params" only exists if processor configured with <lst name="params">
|
||||
|
||||
// Get a request parameter:
|
||||
// some_param = req.getParams().get("some_param")
|
||||
|
||||
// Add a field of field names that match a pattern:
|
||||
// - Potentially useful to determine the fields/attributes represented in a result set, via faceting on field_name_ss
|
||||
// field_names = doc.getFieldNames().toArray();
|
||||
// for(i=0; i < field_names.length; i++) {
|
||||
// field_name = field_names[i];
|
||||
// if (/attr_.*/.test(field_name)) { doc.addField("attribute_ss", field_names[i]); }
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
function processDelete(cmd) {
|
||||
// no-op
|
||||
}
|
||||
|
||||
function processMergeIndexes(cmd) {
|
||||
// no-op
|
||||
}
|
||||
|
||||
function processCommit(cmd) {
|
||||
// no-op
|
||||
}
|
||||
|
||||
function processRollback(cmd) {
|
||||
// no-op
|
||||
}
|
||||
|
||||
function finish() {
|
||||
// no-op
|
||||
}
|
||||
@@ -1,132 +0,0 @@
|
||||
<?xml version='1.0' encoding='UTF-8'?>
|
||||
|
||||
<!--
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Simple transform of Solr query results to HTML
|
||||
-->
|
||||
<xsl:stylesheet version='1.0'
|
||||
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
|
||||
>
|
||||
|
||||
<xsl:output media-type="text/html" encoding="UTF-8"/>
|
||||
|
||||
<xsl:variable name="title" select="concat('Solr search results (',response/result/@numFound,' documents)')"/>
|
||||
|
||||
<xsl:template match='/'>
|
||||
<html>
|
||||
<head>
|
||||
<title><xsl:value-of select="$title"/></title>
|
||||
<xsl:call-template name="css"/>
|
||||
</head>
|
||||
<body>
|
||||
<h1><xsl:value-of select="$title"/></h1>
|
||||
<div class="note">
|
||||
This has been formatted by the sample "example.xsl" transform -
|
||||
use your own XSLT to get a nicer page
|
||||
</div>
|
||||
<xsl:apply-templates select="response/result/doc"/>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="doc">
|
||||
<xsl:variable name="pos" select="position()"/>
|
||||
<div class="doc">
|
||||
<table width="100%">
|
||||
<xsl:apply-templates>
|
||||
<xsl:with-param name="pos"><xsl:value-of select="$pos"/></xsl:with-param>
|
||||
</xsl:apply-templates>
|
||||
</table>
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="doc/*[@name='score']" priority="100">
|
||||
<xsl:param name="pos"></xsl:param>
|
||||
<tr>
|
||||
<td class="name">
|
||||
<xsl:value-of select="@name"/>
|
||||
</td>
|
||||
<td class="value">
|
||||
<xsl:value-of select="."/>
|
||||
|
||||
<xsl:if test="boolean(//lst[@name='explain'])">
|
||||
<xsl:element name="a">
|
||||
<!-- can't allow whitespace here -->
|
||||
<xsl:attribute name="href">javascript:toggle("<xsl:value-of select="concat('exp-',$pos)" />");</xsl:attribute>?</xsl:element>
|
||||
<br/>
|
||||
<xsl:element name="div">
|
||||
<xsl:attribute name="class">exp</xsl:attribute>
|
||||
<xsl:attribute name="id">
|
||||
<xsl:value-of select="concat('exp-',$pos)" />
|
||||
</xsl:attribute>
|
||||
<xsl:value-of select="//lst[@name='explain']/str[position()=$pos]"/>
|
||||
</xsl:element>
|
||||
</xsl:if>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="doc/arr" priority="100">
|
||||
<tr>
|
||||
<td class="name">
|
||||
<xsl:value-of select="@name"/>
|
||||
</td>
|
||||
<td class="value">
|
||||
<ul>
|
||||
<xsl:for-each select="*">
|
||||
<li><xsl:value-of select="."/></li>
|
||||
</xsl:for-each>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
|
||||
<xsl:template match="doc/*">
|
||||
<tr>
|
||||
<td class="name">
|
||||
<xsl:value-of select="@name"/>
|
||||
</td>
|
||||
<td class="value">
|
||||
<xsl:value-of select="."/>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="*"/>
|
||||
|
||||
<xsl:template name="css">
|
||||
<script>
|
||||
function toggle(id) {
|
||||
var obj = document.getElementById(id);
|
||||
obj.style.display = (obj.style.display != 'block') ? 'block' : 'none';
|
||||
}
|
||||
</script>
|
||||
<style type="text/css">
|
||||
body { font-family: "Lucida Grande", sans-serif }
|
||||
td.name { font-style: italic; font-size:80%; }
|
||||
td { vertical-align: top; }
|
||||
ul { margin: 0px; margin-left: 1em; padding: 0px; }
|
||||
.note { font-size:80%; }
|
||||
.doc { margin-top: 1em; border-top: solid grey 1px; }
|
||||
.exp { display: none; font-family: monospace; white-space: pre; }
|
||||
</style>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
@@ -1,67 +0,0 @@
|
||||
<?xml version='1.0' encoding='UTF-8'?>
|
||||
|
||||
<!--
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Simple transform of Solr query results to Atom
|
||||
-->
|
||||
|
||||
<xsl:stylesheet version='1.0'
|
||||
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>
|
||||
|
||||
<xsl:output
|
||||
method="xml"
|
||||
encoding="utf-8"
|
||||
media-type="application/xml"
|
||||
/>
|
||||
|
||||
<xsl:template match='/'>
|
||||
<xsl:variable name="query" select="response/lst[@name='responseHeader']/lst[@name='params']/str[@name='q']"/>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
<title>Example Solr Atom 1.0 Feed</title>
|
||||
<subtitle>
|
||||
This has been formatted by the sample "example_atom.xsl" transform -
|
||||
use your own XSLT to get a nicer Atom feed.
|
||||
</subtitle>
|
||||
<author>
|
||||
<name>Apache Solr</name>
|
||||
<email>solr-user@lucene.apache.org</email>
|
||||
</author>
|
||||
<link rel="self" type="application/atom+xml"
|
||||
href="http://localhost:8983/solr/q={$query}&wt=xslt&tr=atom.xsl"/>
|
||||
<updated>
|
||||
<xsl:value-of select="response/result/doc[position()=1]/date[@name='timestamp']"/>
|
||||
</updated>
|
||||
<id>tag:localhost,2007:example</id>
|
||||
<xsl:apply-templates select="response/result/doc"/>
|
||||
</feed>
|
||||
</xsl:template>
|
||||
|
||||
<!-- search results xslt -->
|
||||
<xsl:template match="doc">
|
||||
<xsl:variable name="id" select="str[@name='id']"/>
|
||||
<entry>
|
||||
<title><xsl:value-of select="str[@name='name']"/></title>
|
||||
<link href="http://localhost:8983/solr/select?q={$id}"/>
|
||||
<id>tag:localhost,2007:<xsl:value-of select="$id"/></id>
|
||||
<summary><xsl:value-of select="arr[@name='features']"/></summary>
|
||||
<updated><xsl:value-of select="date[@name='timestamp']"/></updated>
|
||||
</entry>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
@@ -1,66 +0,0 @@
|
||||
<?xml version='1.0' encoding='UTF-8'?>
|
||||
|
||||
<!--
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Simple transform of Solr query results to RSS
|
||||
-->
|
||||
|
||||
<xsl:stylesheet version='1.0'
|
||||
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>
|
||||
|
||||
<xsl:output
|
||||
method="xml"
|
||||
encoding="utf-8"
|
||||
media-type="application/xml"
|
||||
/>
|
||||
<xsl:template match='/'>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>Example Solr RSS 2.0 Feed</title>
|
||||
<link>http://localhost:8983/solr</link>
|
||||
<description>
|
||||
This has been formatted by the sample "example_rss.xsl" transform -
|
||||
use your own XSLT to get a nicer RSS feed.
|
||||
</description>
|
||||
<language>en-us</language>
|
||||
<docs>http://localhost:8983/solr</docs>
|
||||
<xsl:apply-templates select="response/result/doc"/>
|
||||
</channel>
|
||||
</rss>
|
||||
</xsl:template>
|
||||
|
||||
<!-- search results xslt -->
|
||||
<xsl:template match="doc">
|
||||
<xsl:variable name="id" select="str[@name='id']"/>
|
||||
<xsl:variable name="timestamp" select="date[@name='timestamp']"/>
|
||||
<item>
|
||||
<title><xsl:value-of select="str[@name='name']"/></title>
|
||||
<link>
|
||||
http://localhost:8983/solr/select?q=id:<xsl:value-of select="$id"/>
|
||||
</link>
|
||||
<description>
|
||||
<xsl:value-of select="arr[@name='features']"/>
|
||||
</description>
|
||||
<pubDate><xsl:value-of select="$timestamp"/></pubDate>
|
||||
<guid>
|
||||
http://localhost:8983/solr/select?q=id:<xsl:value-of select="$id"/>
|
||||
</guid>
|
||||
</item>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
@@ -1,337 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
|
||||
<!--
|
||||
Display the luke request handler with graphs
|
||||
-->
|
||||
<xsl:stylesheet
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns="http://www.w3.org/1999/xhtml"
|
||||
version="1.0"
|
||||
>
|
||||
<xsl:output
|
||||
method="html"
|
||||
encoding="UTF-8"
|
||||
media-type="text/html"
|
||||
doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
|
||||
/>
|
||||
|
||||
<xsl:variable name="title">Solr Luke Request Handler Response</xsl:variable>
|
||||
|
||||
<xsl:template match="/">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<link rel="stylesheet" type="text/css" href="solr-admin.css"/>
|
||||
<link rel="icon" href="favicon.ico" type="image/ico"/>
|
||||
<link rel="shortcut icon" href="favicon.ico" type="image/ico"/>
|
||||
<title>
|
||||
<xsl:value-of select="$title"/>
|
||||
</title>
|
||||
<xsl:call-template name="css"/>
|
||||
|
||||
</head>
|
||||
<body>
|
||||
<h1>
|
||||
<xsl:value-of select="$title"/>
|
||||
</h1>
|
||||
<div class="doc">
|
||||
<ul>
|
||||
<xsl:if test="response/lst[@name='index']">
|
||||
<li>
|
||||
<a href="#index">Index Statistics</a>
|
||||
</li>
|
||||
</xsl:if>
|
||||
<xsl:if test="response/lst[@name='fields']">
|
||||
<li>
|
||||
<a href="#fields">Field Statistics</a>
|
||||
<ul>
|
||||
<xsl:for-each select="response/lst[@name='fields']/lst">
|
||||
<li>
|
||||
<a href="#{@name}">
|
||||
<xsl:value-of select="@name"/>
|
||||
</a>
|
||||
</li>
|
||||
</xsl:for-each>
|
||||
</ul>
|
||||
</li>
|
||||
</xsl:if>
|
||||
<xsl:if test="response/lst[@name='doc']">
|
||||
<li>
|
||||
<a href="#doc">Document statistics</a>
|
||||
</li>
|
||||
</xsl:if>
|
||||
</ul>
|
||||
</div>
|
||||
<xsl:if test="response/lst[@name='index']">
|
||||
<h2><a name="index"/>Index Statistics</h2>
|
||||
<xsl:apply-templates select="response/lst[@name='index']"/>
|
||||
</xsl:if>
|
||||
<xsl:if test="response/lst[@name='fields']">
|
||||
<h2><a name="fields"/>Field Statistics</h2>
|
||||
<xsl:apply-templates select="response/lst[@name='fields']"/>
|
||||
</xsl:if>
|
||||
<xsl:if test="response/lst[@name='doc']">
|
||||
<h2><a name="doc"/>Document statistics</h2>
|
||||
<xsl:apply-templates select="response/lst[@name='doc']"/>
|
||||
</xsl:if>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="lst">
|
||||
<xsl:if test="parent::lst">
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<div class="doc">
|
||||
<xsl:call-template name="list"/>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:if>
|
||||
<xsl:if test="not(parent::lst)">
|
||||
<div class="doc">
|
||||
<xsl:call-template name="list"/>
|
||||
</div>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="list">
|
||||
<xsl:if test="count(child::*)>0">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th colspan="2">
|
||||
<p>
|
||||
<a name="{@name}"/>
|
||||
</p>
|
||||
<xsl:value-of select="@name"/>
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<xsl:choose>
|
||||
<xsl:when
|
||||
test="@name='histogram'">
|
||||
<tr>
|
||||
<td colspan="2">
|
||||
<xsl:call-template name="histogram"/>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:apply-templates/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</tbody>
|
||||
</table>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="histogram">
|
||||
<div class="doc">
|
||||
<xsl:call-template name="barchart">
|
||||
<xsl:with-param name="max_bar_width">50</xsl:with-param>
|
||||
<xsl:with-param name="iwidth">800</xsl:with-param>
|
||||
<xsl:with-param name="iheight">160</xsl:with-param>
|
||||
<xsl:with-param name="fill">blue</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="barchart">
|
||||
<xsl:param name="max_bar_width"/>
|
||||
<xsl:param name="iwidth"/>
|
||||
<xsl:param name="iheight"/>
|
||||
<xsl:param name="fill"/>
|
||||
<xsl:variable name="max">
|
||||
<xsl:for-each select="int">
|
||||
<xsl:sort data-type="number" order="descending"/>
|
||||
<xsl:if test="position()=1">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:if>
|
||||
</xsl:for-each>
|
||||
</xsl:variable>
|
||||
<xsl:variable name="bars">
|
||||
<xsl:value-of select="count(int)"/>
|
||||
</xsl:variable>
|
||||
<xsl:variable name="bar_width">
|
||||
<xsl:choose>
|
||||
<xsl:when test="$max_bar_width < ($iwidth div $bars)">
|
||||
<xsl:value-of select="$max_bar_width"/>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:value-of select="$iwidth div $bars"/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:variable>
|
||||
<table class="histogram">
|
||||
<tbody>
|
||||
<tr>
|
||||
<xsl:for-each select="int">
|
||||
<td>
|
||||
<xsl:value-of select="."/>
|
||||
<div class="histogram">
|
||||
<xsl:attribute name="style">background-color: <xsl:value-of select="$fill"/>; width: <xsl:value-of select="$bar_width"/>px; height: <xsl:value-of select="($iheight*number(.)) div $max"/>px;</xsl:attribute>
|
||||
</div>
|
||||
</td>
|
||||
</xsl:for-each>
|
||||
</tr>
|
||||
<tr>
|
||||
<xsl:for-each select="int">
|
||||
<td>
|
||||
<xsl:value-of select="@name"/>
|
||||
</td>
|
||||
</xsl:for-each>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="keyvalue">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@name">
|
||||
<tr>
|
||||
<td class="name">
|
||||
<xsl:value-of select="@name"/>
|
||||
</td>
|
||||
<td class="value">
|
||||
<xsl:value-of select="."/>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="int|bool|long|float|double|uuid|date">
|
||||
<xsl:call-template name="keyvalue"/>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="arr">
|
||||
<tr>
|
||||
<td class="name">
|
||||
<xsl:value-of select="@name"/>
|
||||
</td>
|
||||
<td class="value">
|
||||
<ul>
|
||||
<xsl:for-each select="child::*">
|
||||
<li>
|
||||
<xsl:apply-templates/>
|
||||
</li>
|
||||
</xsl:for-each>
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="str">
|
||||
<xsl:choose>
|
||||
<xsl:when test="@name='schema' or @name='index' or @name='flags'">
|
||||
<xsl:call-template name="schema"/>
|
||||
</xsl:when>
|
||||
<xsl:otherwise>
|
||||
<xsl:call-template name="keyvalue"/>
|
||||
</xsl:otherwise>
|
||||
</xsl:choose>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="schema">
|
||||
<tr>
|
||||
<td class="name">
|
||||
<xsl:value-of select="@name"/>
|
||||
</td>
|
||||
<td class="value">
|
||||
<xsl:if test="contains(.,'unstored')">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:if>
|
||||
<xsl:if test="not(contains(.,'unstored'))">
|
||||
<xsl:call-template name="infochar2string">
|
||||
<xsl:with-param name="charList">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</xsl:if>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template name="infochar2string">
|
||||
<xsl:param name="i">1</xsl:param>
|
||||
<xsl:param name="charList"/>
|
||||
|
||||
<xsl:variable name="char">
|
||||
<xsl:value-of select="substring($charList,$i,1)"/>
|
||||
</xsl:variable>
|
||||
<xsl:choose>
|
||||
<xsl:when test="$char='I'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='I']"/> - </xsl:when>
|
||||
<xsl:when test="$char='T'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='T']"/> - </xsl:when>
|
||||
<xsl:when test="$char='S'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='S']"/> - </xsl:when>
|
||||
<xsl:when test="$char='M'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='M']"/> - </xsl:when>
|
||||
<xsl:when test="$char='V'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='V']"/> - </xsl:when>
|
||||
<xsl:when test="$char='o'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='o']"/> - </xsl:when>
|
||||
<xsl:when test="$char='p'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='p']"/> - </xsl:when>
|
||||
<xsl:when test="$char='O'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='O']"/> - </xsl:when>
|
||||
<xsl:when test="$char='L'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='L']"/> - </xsl:when>
|
||||
<xsl:when test="$char='B'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='B']"/> - </xsl:when>
|
||||
<xsl:when test="$char='C'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='C']"/> - </xsl:when>
|
||||
<xsl:when test="$char='f'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='f']"/> - </xsl:when>
|
||||
<xsl:when test="$char='l'">
|
||||
<xsl:value-of select="/response/lst[@name='info']/lst/str[@name='l']"/> -
|
||||
</xsl:when>
|
||||
</xsl:choose>
|
||||
|
||||
<xsl:if test="not($i>=string-length($charList))">
|
||||
<xsl:call-template name="infochar2string">
|
||||
<xsl:with-param name="i">
|
||||
<xsl:value-of select="$i+1"/>
|
||||
</xsl:with-param>
|
||||
<xsl:with-param name="charList">
|
||||
<xsl:value-of select="$charList"/>
|
||||
</xsl:with-param>
|
||||
</xsl:call-template>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
<xsl:template name="css">
|
||||
<style type="text/css">
|
||||
<![CDATA[
|
||||
td.name {font-style: italic; font-size:80%; }
|
||||
.doc { margin: 0.5em; border: solid grey 1px; }
|
||||
.exp { display: none; font-family: monospace; white-space: pre; }
|
||||
div.histogram { background: none repeat scroll 0%; -moz-background-clip: -moz-initial; -moz-background-origin: -moz-initial; -moz-background-inline-policy: -moz-initial;}
|
||||
table.histogram { width: auto; vertical-align: bottom; }
|
||||
table.histogram td, table.histogram th { text-align: center; vertical-align: bottom; border-bottom: 1px solid #ff9933; width: auto; }
|
||||
]]>
|
||||
</style>
|
||||
</xsl:template>
|
||||
</xsl:stylesheet>
|
||||
@@ -1,70 +0,0 @@
|
||||
<!--
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Simple transform of Solr query response into Solr Update XML compliant XML.
|
||||
When used in the xslt response writer you will get UpdaateXML as output.
|
||||
But you can also store a query response XML to disk and feed this XML to
|
||||
the XSLTUpdateRequestHandler to index the content. Provided as example only.
|
||||
See http://wiki.apache.org/solr/XsltUpdateRequestHandler for more info
|
||||
-->
|
||||
<xsl:stylesheet version='1.0' xmlns:xsl='http://www.w3.org/1999/XSL/Transform'>
|
||||
<xsl:output media-type="text/xml" method="xml" indent="yes"/>
|
||||
|
||||
<xsl:template match='/'>
|
||||
<add>
|
||||
<xsl:apply-templates select="response/result/doc"/>
|
||||
</add>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Ignore score (makes no sense to index) -->
|
||||
<xsl:template match="doc/*[@name='score']" priority="100">
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="doc">
|
||||
<xsl:variable name="pos" select="position()"/>
|
||||
<doc>
|
||||
<xsl:apply-templates>
|
||||
<xsl:with-param name="pos"><xsl:value-of select="$pos"/></xsl:with-param>
|
||||
</xsl:apply-templates>
|
||||
</doc>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Flatten arrays to duplicate field lines -->
|
||||
<xsl:template match="doc/arr" priority="100">
|
||||
<xsl:variable name="fn" select="@name"/>
|
||||
|
||||
<xsl:for-each select="*">
|
||||
<xsl:element name="field">
|
||||
<xsl:attribute name="name"><xsl:value-of select="$fn"/></xsl:attribute>
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:element>
|
||||
</xsl:for-each>
|
||||
</xsl:template>
|
||||
|
||||
|
||||
<xsl:template match="doc/*">
|
||||
<xsl:variable name="fn" select="@name"/>
|
||||
|
||||
<xsl:element name="field">
|
||||
<xsl:attribute name="name"><xsl:value-of select="$fn"/></xsl:attribute>
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:element>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="*"/>
|
||||
</xsl:stylesheet>
|
||||
@@ -1 +0,0 @@
|
||||
name=stackdump-comments
|
||||
@@ -110,6 +110,10 @@
|
||||
|
||||
<!-- we'll get the values out of the JSON, so most fields are not stored -->
|
||||
<!-- fields are listed here so searches can be performed against them -->
|
||||
<!-- this is used by Lucene to uniquely identify a post across all sites.
|
||||
It is of the form "siteKey-id" and is necessary because post IDs are
|
||||
reused across sites. -->
|
||||
<field name="documentId" type="string" indexed="true" stored="true" required="true" />
|
||||
<!-- the ID field needs to be a string for the QueryElevationComponent -->
|
||||
<field name="id" type="string" indexed="true" stored="true" required="true" />
|
||||
<field name="siteKey" type="string" indexed="true" stored="true" required="true" />
|
||||
@@ -196,7 +200,7 @@
|
||||
<!-- Field to use to determine and enforce document uniqueness.
|
||||
Unless this field is marked with required="false", it will be a required field
|
||||
-->
|
||||
<uniqueKey>id</uniqueKey>
|
||||
<uniqueKey>documentId</uniqueKey>
|
||||
|
||||
<!-- DEPRECATED: The defaultSearchField is consulted by various query parsers when
|
||||
parsing a query string that isn't explicit about the field. Machine (non-user)
|
||||
|
||||
@@ -35,5 +35,5 @@ else
|
||||
# shift off the command name so we don't pass it on
|
||||
shift
|
||||
|
||||
$SCRIPT_DIR/start_python.sh $command "$@"
|
||||
$SCRIPT_DIR/start_python3.sh $command "$@"
|
||||
fi
|
||||
|
||||
1058
python/packages/pysolr.py.orig
Normal file
1058
python/packages/pysolr.py.orig
Normal file
File diff suppressed because it is too large
Load Diff
@@ -410,7 +410,8 @@ def view_question(site_key, question_id, answer_id=None):
|
||||
|
||||
result = results.docs[0]
|
||||
convert_comments_to_html(result)
|
||||
rewrite_result(result)
|
||||
if settings.REWRITE_LINKS_AND_IMAGES:
|
||||
rewrite_result(result)
|
||||
sort_answers(result)
|
||||
context['result'] = result
|
||||
|
||||
@@ -826,27 +827,31 @@ def _rewrite_html(html, app_url_root, sites_by_urls):
|
||||
internal_link = False
|
||||
url = t.get('href', None)
|
||||
if url:
|
||||
host = urllib2.Request(url).get_host()
|
||||
site = sites_by_urls.get(host, None)
|
||||
if site:
|
||||
# rewrite this URL for stackdump
|
||||
question_id = SE_QUESTION_ID_RE.search(url)
|
||||
if question_id:
|
||||
question_id = question_id.groupdict()['id']
|
||||
url = '%s%s/%s' % (app_url_root, site.key, question_id)
|
||||
t.set('href', url)
|
||||
t.set('class', t.get('class', '') + ' internal-link')
|
||||
internal_link = True
|
||||
|
||||
answer_id = SE_ANSWER_ID_RE.search(url)
|
||||
if answer_id:
|
||||
answer_id = answer_id.groupdict()['id']
|
||||
url = '%s%s/a/%s' % (app_url_root, site.key, answer_id)
|
||||
t.set('href', url)
|
||||
t.set('class', t.get('class', '') + ' internal-link')
|
||||
internal_link = True
|
||||
try:
|
||||
host = urllib2.Request(url).get_host()
|
||||
except ValueError:
|
||||
# invalid URL or local anchor, leaving as-is
|
||||
pass
|
||||
else:
|
||||
site = sites_by_urls.get(host, None)
|
||||
if site:
|
||||
# rewrite this URL for stackdump
|
||||
question_id = SE_QUESTION_ID_RE.search(url)
|
||||
if question_id:
|
||||
question_id = question_id.groupdict()['id']
|
||||
url = '%s%s/%s' % (app_url_root, site.key, question_id)
|
||||
t.set('href', url)
|
||||
internal_link = True
|
||||
answer_id = SE_ANSWER_ID_RE.search(url)
|
||||
if answer_id:
|
||||
answer_id = answer_id.groupdict()['id']
|
||||
url = '%s%s/a/%s' % (app_url_root, site.key, answer_id)
|
||||
t.set('href', url)
|
||||
internal_link = True
|
||||
|
||||
if not internal_link:
|
||||
if internal_link:
|
||||
t.set('class', t.get('class', '') + ' internal-link')
|
||||
else:
|
||||
t.set('class', t.get('class', '') + ' external-link')
|
||||
|
||||
# get a string back
|
||||
|
||||
1
python/src/stackdump/commands/default_settings.py
Symbolic link
1
python/src/stackdump/commands/default_settings.py
Symbolic link
@@ -0,0 +1 @@
|
||||
../default_settings.py
|
||||
@@ -1,47 +1,142 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This script downloads the sites RSS file and associated logos from the net.
|
||||
|
||||
import urllib
|
||||
import tarfile
|
||||
import urllib.request
|
||||
from xml.etree import ElementTree
|
||||
import os
|
||||
import sys
|
||||
def printf(format, *args):
|
||||
sys.stdout.write(format % args)
|
||||
from shutil import copy
|
||||
import os, ssl, fnmatch
|
||||
from optparse import OptionParser
|
||||
from xml.etree import ElementTree
|
||||
import elasticsearch
|
||||
|
||||
import settings
|
||||
from sqlobject import sqlhub, connectionForURI,AND, IN, SQLObject, \
|
||||
UnicodeCol, DateTimeCol, IntCol, DatabaseIndex, dbconnection
|
||||
from sqlobject.sqlbuilder import Delete, Insert
|
||||
from sqlobject.styles import DefaultStyle
|
||||
from pysolr import Solr, SolrError
|
||||
|
||||
if (not os.environ.get('PYTHONHTTPSVERIFY', '') and
|
||||
getattr(ssl, '_create_unverified_context', None)):
|
||||
ssl._create_defat_https_context = ssl._create_unverified_context
|
||||
|
||||
se_dir = os.path.join(os.environ.get('HOME'), 'stackexchange')
|
||||
sites_path = os.path.join(se_dir, 'Sites.xml')
|
||||
|
||||
script_dir = os.path.dirname(sys.argv[0])
|
||||
sites_file_path = os.path.join(script_dir, '../../../../data/sites')
|
||||
|
||||
sites_file_path = os.path.join(script_dir, ''
|
||||
'../../../../data/')
|
||||
# ensure the data directory exists
|
||||
# download the sites RSS file
|
||||
|
||||
if not os.path.exists(os.path.dirname(sites_file_path)):
|
||||
os.mkdir(os.path.dirname(sites_file_path))
|
||||
|
||||
# download the sites RSS file
|
||||
print 'Downloading StackExchange sites RSS file...',
|
||||
urllib.urlretrieve('http://stackexchange.com/feeds/sites', sites_file_path)
|
||||
print 'done.'
|
||||
print('Downloading StackExchange sites XML file...')
|
||||
# urllib.request.urlretrieve('https://archive.org/download/stackexchange/Sites.xml', sites_file_path)
|
||||
print('done.')
|
||||
|
||||
print ''
|
||||
print('')
|
||||
|
||||
# parse sites RSS file and download logos
|
||||
logos_dir_path = os.path.join(script_dir, '../../../media/images/logos')
|
||||
|
||||
|
||||
# parse sites RSS file and download logosc
|
||||
images_dir_path = os.path.join(script_dir, '../../../media/images')
|
||||
print(os.listdir(images_dir_path))
|
||||
logos_dir_path = os.path.join(images_dir_path, 'logos48')
|
||||
if not os.path.exists(logos_dir_path):
|
||||
os.mkdir(logos_dir_path)
|
||||
icons_dir_path = os.path.join(images_dir_path, 'icons')
|
||||
if not os.path.exists(icons_dir_path):
|
||||
os.mkdir(icons_dir_path)
|
||||
badges_dir_path = os.path.join(images_dir_path, 'badges')
|
||||
if not os.path.exists(badges_dir_path):
|
||||
os.mkdir(badges_dir_path)
|
||||
|
||||
with open(sites_file_path) as f:
|
||||
with open(sites_path) as f:
|
||||
sites_file = ElementTree.parse(f)
|
||||
entries = sites_file.findall('{http://www.w3.org/2005/Atom}entry')
|
||||
|
||||
for entry in entries:
|
||||
entry_title = entry.find('{http://www.w3.org/2005/Atom}title').text.encode('ascii', 'ignore')
|
||||
|
||||
sites = sites_file.findall('row')
|
||||
# print(rows[0].attrib)
|
||||
|
||||
for site in sites:
|
||||
site_title = site.attrib['LongName']
|
||||
site_name = site.attrib['Name']
|
||||
# extract the key from the url - remove the http:// and .com
|
||||
site_key = entry.find('{http://www.w3.org/2005/Atom}id').text
|
||||
if site_key.startswith('http://'):
|
||||
site_key = site_key[len('http://'):]
|
||||
if site_key.endswith('.com'):
|
||||
site_key = site_key[:-len('.com')]
|
||||
if site_key.endswith('.stackexchange'):
|
||||
site_key = site_key[:-len('.stackexchange')]
|
||||
|
||||
print 'Downloading logo for %s...' % entry_title,
|
||||
urllib.urlretrieve('http://sstatic.net/%s/img/icon-48.png' % site_key, os.path.join(logos_dir_path, '%s.png' % site_key))
|
||||
print 'done.'
|
||||
site_key = site.attrib['TinyName']
|
||||
site_url = site.attrib['Url'][8:]
|
||||
logo_url = site.attrib['ImageUrl']
|
||||
icon_url = site.attrib['IconUrl']
|
||||
badge_url = site.attrib['BadgeIconUrl']
|
||||
|
||||
|
||||
site_vars = (site_url, site_key, site_name, site_title)
|
||||
# print(site_vars)
|
||||
printf('Site: %s, key=%s, name="%s", longname="%s"\n' % site_vars)
|
||||
try:
|
||||
logo_file = os.path.join(logos_dir_path, 'logo-%s.png' % site_key)
|
||||
if not os.path.exists(logo_file):
|
||||
print('Downloading logo for %s...' % site_title, urllib.request.urlretrieve(logo_url, logo_file))
|
||||
except Exception as e:
|
||||
print('Failed download logo for %s...' % site_title, str(e))
|
||||
|
||||
try:
|
||||
icon_path = os.path.join(icons_dir_path, 'icon-%s.png' % site_key)
|
||||
if not os.path.exists(icon_path):
|
||||
print('Downloading icon for %s...' % site_title, urllib.request.urlretrieve(icon_url, icon_path))
|
||||
except:
|
||||
print('Failed download ico for %s...' % site_title, icon_url)
|
||||
|
||||
try:
|
||||
badge_file = os.path.join(badges_dir_path, 'badge-%s.png' % site_key)
|
||||
if not os.path.exists(icon_path):
|
||||
print('Downloading badge for %s...' % site_title, urllib.request.urlretrieve(badge_url, badge_file))
|
||||
except:
|
||||
printf('Failed download badge for %s...' % site_title)
|
||||
|
||||
site_files = []
|
||||
print('Key: ' + site_url)
|
||||
for root, dirs, files in os.walk(se_dir):
|
||||
for name in files:
|
||||
if fnmatch.fnmatch(name, site_url + '*'):
|
||||
print('Match: ' + os.path.join(root, name))
|
||||
site_files.append(os.path.join(root, name))
|
||||
|
||||
sites_data = sites_file_path
|
||||
for site_file in site_files:
|
||||
dst = sites_data + os.sep + site_key[0] + os.sep + site_key + os.sep + 'xml'
|
||||
os.makedirs(dst, exist_ok=True)
|
||||
os.chdir(dst)
|
||||
os.system('tar xzf '+site_file)
|
||||
print('Data: ' + site_file)
|
||||
|
||||
def prepare_site(xml_root, dump_date, site_key):
|
||||
print('Using the XML root path: ' + xml_root + '\n')
|
||||
|
||||
if not os.path.exists(xml_root):
|
||||
print('The given XML root path does not exist.')
|
||||
sys.exit(1)
|
||||
|
||||
# connect to the database
|
||||
print('Connecting to the Stackdump database...')
|
||||
conn_str = settings.DATABASE_CONN_STR
|
||||
sqlhub.processConnection = connectionForURI(conn_str)
|
||||
print('Connected.\n')
|
||||
|
||||
# MAIN METHOD
|
||||
if __name__ == '__main__':
|
||||
parser = OptionParser(usage='usage: %pro'
|
||||
'g [options] xml_root_dir')
|
||||
parser.add_option('-k', '--site-key', help='Key of the site (if not in sites).')
|
||||
parser.add_option('-c', '--dump-date', help='Dump date of the site.')
|
||||
|
||||
(cmd_options, cmd_args) = parser.parse_args()
|
||||
|
||||
if len(cmd_args) < 1:
|
||||
print('The path to the directory containing the extracted XML files is required.')
|
||||
sys.exit(1)
|
||||
|
||||
prepare_site(cmd_args[0], cmd_options.dump_date, cmd_options.site_key)
|
||||
|
||||
911
python/src/stackdump/commands/import_recent.py
Normal file
911
python/src/stackdump/commands/import_recent.py
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
1
python/src/stackdump/commands/settings.py
Symbolic link
1
python/src/stackdump/commands/settings.py
Symbolic link
@@ -0,0 +1 @@
|
||||
../settings.py
|
||||
@@ -16,10 +16,10 @@ SERVER_HOST = '0.0.0.0'
|
||||
SERVER_PORT = 8080
|
||||
|
||||
SOLR_URL = 'http://localhost:8983/solr/stackdump/'
|
||||
SOLR_COMMENTS_URL = 'http://localhost:8983/solr/stackdump-comments/'
|
||||
|
||||
import os
|
||||
DATABASE_CONN_STR = 'sqlite:///' + os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data', 'stackdump.sqlite')
|
||||
TEMP_COMMENTS_DATABASE_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'data')
|
||||
|
||||
# if the website is hosted under a subpath, specify it here. It must end with a
|
||||
# slash.
|
||||
@@ -32,6 +32,9 @@ NUM_OF_DEFAULT_COMMENTS = 3
|
||||
# number of random questions to show on search query pages
|
||||
NUM_OF_RANDOM_QUESTIONS = 3
|
||||
|
||||
# rewrite links and images to point internally or to a placeholder respectively
|
||||
REWRITE_LINKS_AND_IMAGES = True
|
||||
|
||||
# settings that are available in templates
|
||||
TEMPLATE_SETTINGS = [
|
||||
'APP_URL_ROOT',
|
||||
|
||||
@@ -34,3 +34,6 @@ from default_settings import *
|
||||
|
||||
# number of random questions to show on search query pages
|
||||
#NUM_OF_RANDOM_QUESTIONS = 3
|
||||
|
||||
# rewrite links and images to point internally or to a placeholder respectively
|
||||
#REWRITE_LINKS_AND_IMAGES = True
|
||||
|
||||
1
questions (1).json
Normal file
1
questions (1).json
Normal file
File diff suppressed because one or more lines are too long
1
questions.json
Normal file
1
questions.json
Normal file
File diff suppressed because one or more lines are too long
BIN
schema.xlsx
Normal file
BIN
schema.xlsx
Normal file
Binary file not shown.
54
start_python3.sh
Executable file
54
start_python3.sh
Executable file
@@ -0,0 +1,54 @@
|
||||
#!/bin/bash
|
||||
|
||||
##
|
||||
# This script attempts to find a version of Python on the system PATH, and
|
||||
# checks that it is 2.5+.
|
||||
#
|
||||
# A alternate Python command can be specified in a file named PYTHON_CMD in this
|
||||
# script's directory. This path will override any lookup on the system PATH.
|
||||
##
|
||||
|
||||
# FUNCTIONS
|
||||
function checkPythonVersion {
|
||||
if [ ! -z "$1" ]
|
||||
then
|
||||
PYTHON_VER_MAJOR=`echo $1 | cut -d "." -f 1`
|
||||
PYTHON_VER_MINOR=`echo $1 | cut -d "." -f 2`
|
||||
|
||||
if [ $PYTHON_VER_MAJOR -eq "3" -a $PYTHON_VER_MINOR -ge "5" ]
|
||||
then
|
||||
return 1
|
||||
fi
|
||||
fi
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# MAIN
|
||||
SCRIPT_DIR=`dirname $0`
|
||||
PYTHON_CMD=python3
|
||||
|
||||
# if there is a PYTHON_CMD file in the script directory, use that instead
|
||||
if [ -e "$SCRIPT_DIR/PYTHON_CMD" ]
|
||||
then
|
||||
PYTHON_CMD=`cat "$SCRIPT_DIR/PYTHON_CMD"`
|
||||
fi
|
||||
|
||||
if [ ! -z "`which "$PYTHON_CMD" 2>/dev/null`" ]
|
||||
then
|
||||
# check if Python is the right version
|
||||
PYTHON_VER=`"$PYTHON_CMD" -V 2>&1 | cut -d " " -f 2`
|
||||
checkPythonVersion "$PYTHON_VER"
|
||||
if [ $? == 1 ]
|
||||
then
|
||||
echo "Using Python `which "$PYTHON_CMD"`"
|
||||
|
||||
# execution ends here if Python is found
|
||||
PYTHONPATH=$SCRIPT_DIR/pyth3/packages:$SCRIPT_DIR/python/src:$PYTHONPATH
|
||||
env "PYTHONPATH=$PYTHONPATH" "$PYTHON_CMD" "$@"
|
||||
exit $?
|
||||
fi
|
||||
fi
|
||||
|
||||
# if we get here, it means the right version of Python was not found
|
||||
echo 'No suitable version of Python was found. Python 2.5 or later is required.'
|
||||
Reference in New Issue
Block a user