UsenetSearch/usenetsearch.example.conf

81 lines
3.0 KiB
Plaintext
Raw Normal View History

#####################################
# NNTP server configuration details #
#####################################
# Connection details for the NNTP (Usenet) server. The values below are
# placeholders; replace them with the details from your provider.
nntp_server_host: my.new.server.example.com
# NOTE(review): 119 is the standard plain-text NNTP port; servers that use
# implicit TLS conventionally listen on 563 - confirm with your provider.
nntp_server_port: 119
nntp_server_user: someuser
nntp_server_pass: changeme
# NOTE(review): presumably "yes" enables SSL/TLS for the connection; confirm
# the accepted values against the configuration parser.
nntp_server_use_ssl: no
#################
# Path settings #
#################
# Location of the database (relative or absolute path).
# NOTE(review): a relative path is presumably resolved against the working
# directory the program is started from - confirm.
database_path: ./db
####################
# Storage settings #
####################
# A higher tree depth creates more search tokens, so it improves the speed and
# likelihood of finding search results, at the cost of extra storage
# requirements, more files, and slower indexing.
max_tree_depth: 10
################################
# Parallel processing settings #
################################
# If you're processing headers faster than you can pull them down over the
# network, you're likely not going to need more than 1 or 2 threads, but
# otherwise, more threads can help. The batch size should be large enough such
# that all configured threads have enough work to do.
#
# The higher your max_tree_depth, the more likely you'll need to increase
# this.
max_threads: 8
batch_size: 1000
########################
# Word filter settings #
########################
# It's important to filter out commonly used words to avoid blowing up an index
# in size. Huge indexes are going to eat a lot of disk space and slow down
# searches.
# This setting lists all substrings that should be erased from subjects and
# search strings before they are tokenized. For instance, you might not want to
# store all results for the word "the", or "in" and other stopwords.
# The list of strings is comma-separated and case-insensitive. Each subsequent
# option appends to the previously defined list, so a word only needs to be
# listed once across all of the lines below.
filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and
filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how
filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
filter_erase_subtoken: neither,nor,not,of,oh,ok,the,to,when,where
filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes
filter_erase_subtoken: yet,you,your
# This setting lets you list all tokens that will only be indexed on direct
# (whole string) matches. Tokens are comma-separated, and the configuration
# option may be listed multiple times; each subsequent option appends to the
# previously defined list. All tokens are case-insensitive.
# NOTE(review): the two phrases below look like sample values - replace them
# with phrases relevant to your own index.
filter_no_subtoken: makes for,funny business
# Sets the minimum number of words in a sub-token. Use this if you don't want
# to index single words unless they are a direct match to the subject (in
# which case you'd set this to a minimum of 2 words), or set an even higher
# minimum if you want to optimize search speed and disk usage further.
minimum_subtoken_words: 2