UsenetSearch/usenetsearch.example.conf

103 lines
3.8 KiB
Plaintext

#####################################
# NNTP server configuration details #
#####################################
nntp_server_host: my.new.server.example.com
nntp_server_port: 119
nntp_server_user: someuser
nntp_server_pass: changeme
nntp_server_use_ssl: no
#################
# Path settings #
#################
# database location (relative or absolute path)
database_path: ./db
####################
# Storage settings #
####################
# A higher tree depth creates more search tokens, so it improves the speed and
# likelihood of finding search results, at the cost of extra storage
# requirements, more files, and slower indexing.
max_tree_depth: 10
################################
# Parallel processing settings #
################################
# If you're processing headers faster than you can pull them down over the
# network, you're likely not going to need more than 1 or 2 threads, but
# otherwise, more threads can help. The batch size should be large enough such
# that all configured threads have enough work to do.
#
# The higher your max_tree_depth, the more likely you'll need to increase
# this.
max_threads: 16
batch_size: 10000
#############################
# Newsgroup filter settings #
#############################
# List one or more newsgroup regular expressions to include or exclude from
# being indexed. Blacklisted patterns take precedence over whitelisted patterns.
# These options may be repeated to include additional blacklist/whitelist
# regular expressions.
# If filter_newsgroup_whitelist is set, only newsgroups matching the configured
# regular expressions will be included in indexing.
# If not set, all of usenet will be indexed (with the exception of the
# filter_newsgroup_blacklist groups).
filter_newsgroup_whitelist: ^alt\.bible$
filter_newsgroup_whitelist: ^borland\.public\.cppbuilder\.*
# filter_newsgroup_blacklist allows you to exclude newsgroups from being
# indexed, whether filter_newsgroup_whitelist is set or not.
filter_newsgroup_blacklist: .*binaries.*
########################
# Word filter settings #
########################
# It's important to filter out commonly used words to avoid blowing up an index
# in size. Huge indexes are going to eat a lot of disk space and slow down
# searches.
# This setting lists all substrings that should be erased from subjects and
# search strings before they are tokenized. For instance, you might not want to
# store all results for the word "the", or "in" and other stopwords.
# The list of strings is comma-separated and case-insensitive. Each subsequent
# option appends to the previously defined list.
# filter_erase_subtoken: the,by
# This setting lets you list all tokens that will only be indexed on direct
# (whole string) matches. Each token is comma-separated, and the configuration
# option may be listed multiple times as well, each subsequent option appends to
# the previously defined list. All tokens are case-insensitive.
filter_no_subtoken: a,about,actually,almost,also,although,always,am,an,and
filter_no_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
filter_no_subtoken: does,each,either,else,for,from,had,has,have,hence,how
filter_no_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
filter_no_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
filter_no_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
filter_no_subtoken: whoever,whose,why,will,with,within,without,would,yes
filter_no_subtoken: yet,you,your
# Sets the minimum number of words in a sub-token. You may use this if you don't
# want to index single words unless they are a direct match to the subject (in
# which case, you'd set this to a minimum of 2 words) - or you may even want a
# higher minimum than that if you really want to optimize search speed and
# disk usage.
minimum_subtoken_words: 2