ToeCracker/ToeCracker/Thesaurus.cs

153 lines
7.4 KiB
C#

using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Windows.Forms;
using System.Xml;
using System.Text;
namespace ToeCracker {
/// <summary>
/// Methods to look up a word in the thesaurus and parse the results.
/// </summary>
public static class Thesaurus {
    /// <summary>Opening tag of the HTML element that wraps the search results.</summary>
    private const string ResultHeader = "<div class=\"results\">";

    /// <summary>Closing tag that marks the end of the search results element.</summary>
    private const string ResultFooter = "</div>";

    /// <summary>
    /// A dictionary mapping English word group keys with translated values.
    /// Some keys might be missing, in which case they should be displayed as they are;
    /// you might want to open an issue to request those translations to be added.
    /// </summary>
    /// <remarks>
    /// A new dictionary is built on every access, so callers that need several lookups
    /// should read this property once and keep the returned instance.
    /// </remarks>
    public static Dictionary<string, string> Translations {
        get {
            Dictionary<string, string> tr = new Dictionary<string, string>();
            tr["adjective"] = Properties.Resources.Adjective;
            // NOTE(review): "adverb" is mapped to the Verb resource, which looks like a copy/paste
            // slip. Confirm whether an Adverb resource exists and should be used here instead.
            tr["adverb"] = Properties.Resources.Verb;
            tr["antonyms"] = Properties.Resources.Antonyms;
            tr["noun"] = Properties.Resources.Noun;
            tr["related terms"] = Properties.Resources.RelatedTerms;
            tr["rhymes with"] = Properties.Resources.RhymesWith;
            tr["similar terms"] = Properties.Resources.SimilarTerms;
            tr["sounds kind of like"] = Properties.Resources.SoundsLike;
            tr["verb"] = Properties.Resources.Verb;
            return tr;
        }
    }

    /// <summary>
    /// Build the absolute URI to request for a given query by inserting it
    /// into the <c>BaseURL</c> format string from the application settings.
    /// </summary>
    /// <param name="query">Word to look for; a null value is treated as an empty string.</param>
    /// <returns>The absolute URI to request.</returns>
    private static Uri BuildURI(string query) {
        // Escape the query so that reserved characters (&, #, ?, /, ...) typed by the user
        // cannot alter the structure of the URL.
        string escapedQuery = Uri.EscapeDataString(query ?? String.Empty);
        return new Uri(String.Format(Properties.Settings.Default.BaseURL, escapedQuery), UriKind.Absolute);
    }

    /// <summary>
    /// Look up a word in the thesaurus and return the results.
    /// </summary>
    /// <param name="query">Word to look for.</param>
    /// <returns>A dictionary holding an array of words for each word group.</returns>
    /// <exception cref="WebException">The request failed or returned a status code outside of the 2xx range.</exception>
    /// <exception cref="XmlException">The result block could not be found or could not be parsed.</exception>
    public static Dictionary<string, string[]> Search(string query) {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(BuildURI(query));
        request.UserAgent = String.Format("{0}/{1}", Application.ProductName, Application.ProductVersion);

        XmlDocument doc = new XmlDocument();
        // Dispose the response itself, not only its stream, so that the connection
        // is released even when the status code check below throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse()) {
            int statusCode = (int)response.StatusCode;
            if (statusCode < 200 || statusCode > 299)
                throw new WebException(String.Format("Unexpected HTTP status code {0}", response.StatusCode));

            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream)) {
                doc.LoadXml(ReadResultBlock(reader));
            }
        }
        return ParseResults(doc);
    }

    /// <summary>
    /// Extract the results element from the HTML response as a string that can be parsed as XML.
    /// </summary>
    /// <param name="reader">Reader positioned at the beginning of the HTML response.</param>
    /// <returns>The text of the results element, from its opening tag to its closing tag.</returns>
    /// <exception cref="XmlException">The beginning or the end of the result block was not found.</exception>
    private static string ReadResultBlock(TextReader reader) {
        /*
         * The HTML response is not XHTML, so we cannot parse it as XML as a whole.
         * However, the search results (including a "Not found" error) are in a div tag
         * whose contents can be parsed using an XML parser.
         * We look for the line holding the beginning of this <div>, read until the </div>,
         * and return that portion only.
         */
        string line;
        while ((line = reader.ReadLine()) != null) {
            if (line.Contains(ResultHeader))
                break;
        }
        if (line == null)
            throw new XmlException("Result block not found.");

        // The rest of the header line is not kept: the block is rebuilt from the header constant
        // followed by the lines that come after it.
        StringBuilder sb = new StringBuilder(ResultHeader);
        while ((line = reader.ReadLine()) != null) {
            int footerIndex = line.IndexOf(ResultFooter, StringComparison.Ordinal);
            if (footerIndex >= 0) {
                // Drop anything that follows the closing tag on the same line;
                // trailing markup would otherwise make the XML parser fail.
                sb.AppendLine(line.Substring(0, footerIndex + ResultFooter.Length));
                return sb.ToString();
            }
            sb.AppendLine(line);
        }
        throw new XmlException("End of result block not found.");
    }

    /// <summary>
    /// Turn the parsed results element into a dictionary of word groups.
    /// </summary>
    /// <param name="doc">Document whose root element is the results element.</param>
    /// <returns>A dictionary holding an array of words for each word group.</returns>
    /// <exception cref="XmlException">A subgroup or a word list was found before any group name.</exception>
    private static Dictionary<string, string[]> ParseResults(XmlDocument doc) {
        /*
         * We iterate on the nodes inside of the <div>. We can expect four types of tags:
         * - <h2>, which contains the word we just searched and that we can ignore
         * - <h3>, the beginning of a new word group
         * - <h4>, a subgroup within the word group
         * - <ul>, listing words inside a group.
         * We ignore any other tags to stay flexible.
         */
        Dictionary<string, string[]> results = new Dictionary<string, string[]>();
        // Read the property once: each access builds a new dictionary.
        Dictionary<string, string> translations = Translations;
        string groupName = null;
        List<string> words = new List<string>();

        foreach (XmlNode node in doc.DocumentElement.ChildNodes) {
            if (node.NodeType != XmlNodeType.Element)
                continue;

            if (IsTag(node, "h3")) {
                // Beginning of a new word group: store the previous one, if any.
                StoreGroup(results, groupName, words);
                groupName = node.InnerText;
            } else if (IsTag(node, "h4")) {
                // We represent subgroups by adding an empty word (an empty line in the display),
                // and format the subgroup name in uppercase with dashes: "-- ANTONYMS --"
                if (groupName == null)
                    throw new XmlException("Unexpected subgroup name with no group name.");
                words.Add("");
                string title = node.InnerText.ToLowerInvariant();
                string translated;
                if (translations.TryGetValue(title, out translated))
                    title = translated;
                words.Add(String.Format("-- {0} --", title.ToUpperInvariant()));
            } else if (IsTag(node, "ul")) {
                if (groupName == null)
                    throw new XmlException("Unexpected word list with no group name.");
                foreach (XmlNode listItem in node.ChildNodes) {
                    if (listItem.NodeType == XmlNodeType.Element && IsTag(listItem, "li"))
                        words.Add(listItem.InnerText);
                }
            }
        }

        // Once we have finished iterating, there might be one remaining group to store.
        StoreGroup(results, groupName, words);
        return results;
    }

    /// <summary>
    /// Tell whether a node has the given tag name, ignoring case.
    /// </summary>
    /// <param name="node">Node to check.</param>
    /// <param name="tagName">Expected tag name.</param>
    /// <returns>True when the node name matches the tag name.</returns>
    private static bool IsTag(XmlNode node, string tagName) {
        return String.Equals(node.Name, tagName, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Store the pending words under the given group name, then empty the pending list.
    /// Does nothing when there is no current group or no pending words.
    /// </summary>
    /// <param name="results">Dictionary receiving the word groups.</param>
    /// <param name="groupName">Name of the group being closed, or null if there is none.</param>
    /// <param name="words">Words collected for the group; cleared once stored.</param>
    private static void StoreGroup(Dictionary<string, string[]> results, string groupName, List<string> words) {
        if (groupName == null || words.Count == 0)
            return;

        string[] existing;
        if (results.TryGetValue(groupName, out existing)) {
            // Group names should be unique, but we tolerate duplicates by concatenating the word lists.
            List<string> merged = new List<string>(existing);
            merged.AddRange(words);
            results[groupName] = merged.ToArray();
        } else {
            results[groupName] = words.ToArray();
        }
        words.Clear();
    }
}
}