UsenetSearch/usenetsearch.example.conf


#####################################
# NNTP server configuration details #
#####################################

nntp_server_host: my.new.server.example.com
nntp_server_port: 119
nntp_server_user: someuser
nntp_server_pass: changeme
nntp_server_use_ssl: no

#################
# Path settings #
#################

# database location (relative or absolute path)
database_path: ./db

####################
# Storage settings #
####################

# A higher tree depth creates more search tokens, so it improves the speed and
# likelihood of finding search results, at the cost of extra storage
# requirements, more files, slower indexing.

max_tree_depth: 10

################################
# Parallel processing settings #
################################

# If you're processing headers faster than you can pull them down over the
# network, you're likely not going to need more than 1 or 2 threads, but
# otherwise, more threads can help. The batch size should be large enough such
# that all configured threads have enough work to do.
#
# The higher your max_tree_depth, the more likely you'll need to increase
# this.

max_threads: 8
batch_size: 1000

########################
# Word filter settings #
########################

# It's important to filter out commonly used words to avoid blowing up an index
# in size. Huge indexes are going to eat a lot of disk space and slow down
# searches.

# This setting lists all substrings that should be erased from subjects and
# search strings before they are tokenized. For instance, you might not want to 
# store all results for the word "the", or "in" and other stopwords.
# List of strings is comma-separated and case-insensitive. Each subsequent
# option appends to the previously defined list.

filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and
filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how
filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
filter_erase_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes
filter_erase_subtoken: yet,you,your

# This setting lets you list all tokens that will only be indexed on direct
# (whole string) matches. Each token is comma-separated, and the configuration
# option may be listed multiple times as well, each subsequent option appends to
# the previously defined list. All tokens are case-insensitive.

filter_no_subtoken: makes for,funny business

# Sets the minimum number of words in a sub-token. You may use this if you don't
# want to index single words unless they are a direct match to the subject (in
# which case, you'd set this to a minimum of 2 words) - or you may even want a
# higher minimum than that if you're really wanting to optimize search speed and
# disk usage.

minimum_subtoken_words: 2
Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00
			`#####################################`
			`# NNTP server configuration details #`
			`#####################################`

			`nntp_server_host: my.new.server.example.com`
Configuration file, arg parsing, database serialization,... 2021-09-21 00:48:49 +00:00			`nntp_server_port: 119`
Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`nntp_server_user: someuser`
			`nntp_server_pass: changeme`
Configuration file, arg parsing, database serialization,... 2021-09-21 00:48:49 +00:00			`nntp_server_use_ssl: no`

Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`#################`
			`# Path settings #`
			`#################`

			`# database location (relative or absolute path)`
Configuration file, arg parsing, database serialization,... 2021-09-21 00:48:49 +00:00			`database_path: ./db`
btree storage, multithreaded processing 2021-09-29 23:52:54 +00:00
Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`####################`
			`# Storage settings #`
			`####################`

			`# A higher tree depth creates more search tokens, so it improves the speed and`
			`# likelihood of finding search results, at the cost of extra storage`
			`# requirements, more files, slower indexing.`

			`max_tree_depth: 10`

			`################################`
			`# Parallel processing settings #`
			`################################`

			`# If you're processing headers faster than you can pull them down over the`
			`# network, you're likely not going to need more than 1 or 2 threads, but`
			`# otherwise, more threads can help. The batch size should be large enough such`
			`# that all configured threads have enough work to do.`
			`#`
			`# The higher your max_tree_depth, the more likely you'll need to increase`
			`# this.`

			`max_threads: 8`
			`batch_size: 1000`

			`########################`
			`# Word filter settings #`
			`########################`

			`# It's important to filter out commonly used words to avoid blowing up an index`
			`# in size. Huge indexes are going to eat a lot of disk space and slow down`
			`# searches.`

			`# This setting lists all substrings that should be erased from subjects and`
			`# search strings before they are tokenized. For instance, you might not want to`
			`# store all results for the word "the", or "in" and other stopwords.`
			`# List of strings is comma-separated and case-insensitive. Each subsequent`
			`# option appends to the previously defined list.`

			`filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and`
			`filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do`
			`filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how`
			`filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my`
			`filter_erase_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where`
			`filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom`
			`filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes`
			`filter_erase_subtoken: yet,you,your`

			`# This setting lets you list all tokens that will only be indexed on direct`
			`# (whole string) matches. Each token is comma-separated, and the configuration`
			`# option may be listed multiple times as well, each subsequent option appends to`
			`# the previously defined list. All tokens are case-insensitive.`

			`filter_no_subtoken: makes for,funny business`

			`# Sets the minimum number of words in a sub-token. You may use this if you don't`
			`# want to index single words unless they are a direct match to the subject (in`
			`# which case, you'd set this to a minimum of 2 words) - or you may even want a`
			`# higher minimum than that if you're really wanting to optimize search speed and`
			`# disk usage.`
Improve storage efficience, add token db file parser executable, etc.... 2021-10-08 20:17:22 +00:00
Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`minimum_subtoken_words: 2`