// RandomTextGenerator.cs
// ------------------------------------------------------------------
//
// Copyright (c) 2009 Dino Chiesa
// All rights reserved.
//
// This code module is part of DotNetZip, a zipfile class library.
//
// ------------------------------------------------------------------
//
// This code is licensed under the Microsoft Public License.
// See the file License.txt for the license details.
// More info on: http://dotnetzip.codeplex.com
//
// ------------------------------------------------------------------
//
// last saved (in emacs):
// Time-stamp: <2011-July-13 16:37:19>
//
// ------------------------------------------------------------------
//
// This module defines a class that generates random text sequences
// using a Markov chain.
//
// ------------------------------------------------------------------

using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using Ionic.Zip;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Threading;

namespace Ionic.Zip.Tests.Utilities
{
    /// <summary>
    ///   Produces random text from a Markov chain that is seeded with a
    ///   document downloaded from one of a fixed list of URIs.
    /// </summary>
    public class RandomTextGenerator
    {
        static string[] uris = new string[]
            {
                // "Through the Looking Glass", by Lewis Carroll (~181k)
                "http://www.gutenberg.org/files/12/12.txt",

                // Decl of Independence (~16k)
                "http://www.gutenberg.org/files/16780/16780.txt",

                // Decl of Independence, alternative source
                "http://www.constitution.org/usdeclar.txt",

                // Section 552a of the US code - on privacy for individuals
                "http://www.opm.gov/feddata/usc552a.txt",

                // The Naval War of 1812, by Theodore Roosevelt (968k)
                "http://www.gutenberg.org/dirs/etext05/7trnv10.txt",

                // On Prayer and the Contemplative Life, by Thomas Aquinas (440k)
                "http://www.gutenberg.org/files/22295/22295.txt",

                // IETF RFC 1951 - the DEFLATE format
                "http://www.ietf.org/rfc/rfc1951.txt",

                // pkware's appnote
                "http://www.pkware.com/documents/casestudies/APPNOTE.TXT",
            };

        SimpleMarkovChain markov;

        /// <summary>
        ///   Downloads a randomly chosen seed document and builds the
        ///   Markov chain from it.
        /// </summary>
        /// <remarks>
        ///   A failed download (WebException) causes another randomly chosen
        ///   URI to be tried. The ninth consecutive failure is rethrown to
        ///   the caller.
        /// </remarks>
        public RandomTextGenerator()
        {
            System.Random rnd = new System.Random();
            string seedText = null;
            int cycles = 0;
            do
            {
                try
                {
                    string uri = uris[rnd.Next(uris.Length)];
                    seedText = GetPageMarkup(uri);
                }
                catch (System.Net.WebException)
                {
                    cycles++;
                    if (cycles > 8) throw;
                    seedText = null;
                }
            } while (seedText == null);

            markov = new SimpleMarkovChain(seedText);
        }

        /// <summary>
        ///   Generates random text with a minimum character length.
        /// </summary>
        /// <param name="length">
        ///   The minimum length of text, in characters, to produce.
        /// </param>
        /// <returns>The generated text.</returns>
        public string Generate(int length)
        {
            return markov.GenerateText(length);
        }

        /// <summary>
        ///   Downloads the resource at the given URI as a string.
        /// </summary>
        private static string GetPageMarkup(string uri)
        {
            using (WebClient client = new WebClient())
            {
                return client.DownloadString(uri);
            }
        }
    }


    /// <summary>
    ///   Implements a simple Markov chain for text.
    /// </summary>
    ///
    /// <remarks>
    ///   Uses a Markov chain starting with some base texts to produce
    ///   random natural-ish text. This implementation is based on Pike's
    ///   perl implementation, see
    ///   http://cm.bell-labs.com/cm/cs/tpop/markov.pl
    /// </remarks>
    public class SimpleMarkovChain
    {
        // Maps a normalized word (lower-cased, punctuation trimmed) to every
        // word that was seen immediately after it in the seed text.
        Dictionary<String, List<String>> table = new Dictionary<String, List<String>>();
        System.Random rnd = new System.Random();

        /// <summary>
        ///   Builds the word-transition table from the given seed text.
        /// </summary>
        /// <param name="seed">The text from which the chain is learned.</param>
        public SimpleMarkovChain(string seed)
        {
            if (seed == null) throw new ArgumentNullException("seed");

            string NEWLINE = "\n";
            string key = NEWLINE;
            using (var sr = new StringReader(seed))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    foreach (var word in line.SplitByWords())
                    {
                        // empty words and bare carriage returns are recorded as newlines
                        var w = (word == "") ? NEWLINE : word;
                        if (word == "\r") w = NEWLINE;

                        AddTransition(key, w);
                        key = w.ToLower().TrimPunctuation();
                    }
                }
            }

            // Terminate the chain: the last key seen is followed by a newline,
            // which guarantees that every recorded word is also a key.
            AddTransition(key, NEWLINE);
        }

        /// <summary>
        ///   Records that <paramref name="next"/> followed <paramref name="key"/>.
        /// </summary>
        private void AddTransition(string key, string next)
        {
            List<String> followers;
            if (!table.TryGetValue(key, out followers))
            {
                followers = new List<String>();
                table.Add(key, followers);
            }
            followers.Add(next);
        }

        /// <summary>
        ///   Writes the whole transition table to the console.
        /// </summary>
        internal void Diag()
        {
            Console.WriteLine("There are {0} keys in the table", table.Keys.Count);
            foreach (string s in table.Keys)
            {
                string x = s.Replace("\n", "�");
                var y = table[s].ToArray();
                Console.WriteLine(" {0}: {1}", x, String.Join(", ", y));
            }
        }

        /// <summary>
        ///   Writes the list of followers of a single key to the console.
        /// </summary>
        internal void ShowList(string word)
        {
            string x = word.Replace("\n", "�");
            List<String> followers;
            if (table.TryGetValue(word, out followers))
            {
                var y = followers.ToArray();
                var z = Array.ConvertAll(y, x1 => x1.Replace("\n", "�"));
                Console.WriteLine(" {0}: {1}", x, String.Join(", ", z));
            }
            else
                Console.WriteLine(" {0}: -key not found-", x);
        }

        private List<String> _keywords;

        // Lazily-built snapshot of the table keys, used to pick a random start word.
        private List<String> keywords
        {
            get
            {
                if (_keywords == null)
                    _keywords = new List<String>(table.Keys);
                return _keywords;
            }
        }

        /// <summary>
        ///   Generates random text with a minimum character length.
        /// </summary>
        ///
        /// <param name="minimumLength">
        ///   The minimum length of text, in characters, to produce.
        /// </param>
        public string GenerateText(int minimumLength)
        {
            var chosenStartWord = keywords[rnd.Next(keywords.Count)];
            return _InternalGenerate(chosenStartWord, StopCriterion.NumberOfChars, minimumLength);
        }

        /// <summary>
        ///   Generates random text with a minimum character length.
        /// </summary>
        ///
        /// <remarks>
        ///   The first sentence will start with the given start word.
        /// </remarks>
        ///
        /// <param name="minimumLength">
        ///   The minimum length of text, in characters, to produce.
        /// </param>
        /// <param name="start">
        ///   The word to start with. If this word does not exist in the
        ///   seed text, the generation will fail with an ArgumentException.
        /// </param>
        public string GenerateText(string start, int minimumLength)
        {
            return _InternalGenerate(start, StopCriterion.NumberOfChars, minimumLength);
        }

        /// <summary>
        ///   Generate random text with a minimum number of words.
        /// </summary>
        ///
        /// <remarks>
        ///   The first sentence will start with the given start word.
        /// </remarks>
        ///
        /// <param name="minimumWords">
        ///   The minimum number of words of text to produce.
        /// </param>
        /// <param name="start">
        ///   The word to start with. If this word does not exist in the
        ///   seed text, the generation will fail with an ArgumentException.
        /// </param>
        public string GenerateWords(string start, int minimumWords)
        {
            return _InternalGenerate(start, StopCriterion.NumberOfWords, minimumWords);
        }

        /// <summary>
        ///   Generate random text with a minimum number of words.
        /// </summary>
        ///
        /// <param name="minimumWords">
        ///   The minimum number of words of text to produce.
        /// </param>
        public string GenerateWords(int minimumWords)
        {
            var chosenStartWord = keywords[rnd.Next(keywords.Count)];
            return _InternalGenerate(chosenStartWord, StopCriterion.NumberOfWords, minimumWords);
        }

        /// <summary>
        ///   Walks the chain from the start word until the stop criterion
        ///   is met and the sentence in flight has been completed.
        /// </summary>
        /// <remarks>
        ///   A sentence is considered complete when a word ending in "." has
        ///   been emitted; generation therefore only terminates if such a
        ///   word is reachable in the chain.
        /// </remarks>
        /// <exception cref="System.ArgumentNullException">start is null.</exception>
        /// <exception cref="System.ArgumentException">
        ///   start (lower-cased) is not a key of the transition table.
        /// </exception>
        private string _InternalGenerate(string start, StopCriterion crit, int limit)
        {
            if (start == null) throw new ArgumentNullException("start");

            string w1 = start.ToLower();

            // An unknown start word can never advance the chain; without this
            // check the loop below would spin forever producing nothing.
            if (!table.ContainsKey(w1))
                throw new ArgumentException("The start word does not appear in the seed text.", "start");

            StringBuilder sb = new StringBuilder();
            sb.Append(start.Capitalize());
            int consecutiveNewLines = 0;

            // About the stop criteria:
            // we keep going til we reach the specified number of words or chars, with the added
            // proviso that we have to complete the in-flight sentence when the limit is reached.
            for (int i = 0;
                 (crit == StopCriterion.NumberOfWords && i < limit) ||
                     (crit == StopCriterion.NumberOfChars && sb.Length < limit) ||
                     consecutiveNewLines == 0;
                 i++)
            {
                List<String> list;
                if (!table.TryGetValue(w1, out list))
                    break; // not expected: every recorded word is also a key

                string word = list[rnd.Next(list.Count)];
                if (word != "\n")
                {
                    // capitalize the first word after a sentence end
                    if (consecutiveNewLines > 0)
                        sb.Append(word.Capitalize());
                    else
                        sb.Append(" ").Append(word);

                    // words that end sentences get a newline
                    if (word.EndsWith("."))
                    {
                        if (consecutiveNewLines == 0 || consecutiveNewLines == 1)
                            sb.Append("\n");
                        consecutiveNewLines++;
                    }
                    else consecutiveNewLines = 0;
                }
                w1 = word.ToLower().TrimPunctuation();
            }
            return sb.ToString();
        }

        private enum StopCriterion
        {
            NumberOfWords,
            NumberOfChars
        }
    }


    /// <summary>
    ///   A read-only, non-seekable stream that yields a requested number
    ///   of bytes of encoded random text.
    /// </summary>
    public class RandomTextInputStream : Stream
    {
        RandomTextGenerator _rtg;
        Int64 _desiredLength;
        Int64 _bytesRead;
        System.Text.Encoding _encoding;
        byte[][] _randomText;   // lazily filled cache of encoded text chunks
        System.Random _rnd;
        int _gnt;
        private static readonly int _chunkSize = 1024 * 128;
        private static readonly int _chunks = 48;

        /// <summary>
        ///   Creates a stream of the given length, encoded as ASCII.
        /// </summary>
        public RandomTextInputStream(Int64 length)
            : this(length, System.Text.Encoding.GetEncoding("ascii"))
        {
        }

        /// <summary>
        ///   Creates a stream of the given length using the given encoding.
        /// </summary>
        public RandomTextInputStream(Int64 length, System.Text.Encoding encoding)
            : base()
        {
            _desiredLength = length;
            _rtg = new RandomTextGenerator();
            _encoding = encoding;
            _randomText = new byte[_chunks][];
            _rnd = new System.Random();
        }

        /// <summary>
        ///   for diagnostic purposes only
        /// </summary>
        public int GetNewTextCount
        {
            get { return _gnt; }
        }

        new public void Dispose()
        {
            Dispose(true);
        }

        /// <summary>The Dispose method</summary>
        protected override void Dispose(bool disposeManagedResources)
        {
        }

        /// <summary>
        ///   Returns one of the cached text chunks, chosen at random,
        ///   generating and encoding it on first use.
        /// </summary>
        private byte[] GetNewText()
        {
            _gnt++;
            int nowServing = _rnd.Next(_chunks);
            if (_randomText[nowServing] == null)
                _randomText[nowServing] = _encoding.GetBytes(_rtg.Generate(_chunkSize));
            return _randomText[nowServing];
        }

        /// <summary>
        ///   The total number of bytes delivered by Read so far.
        /// </summary>
        public Int64 BytesRead
        {
            get { return _bytesRead; }
        }

        /// <summary>
        ///   Fills the buffer with random text, up to the number of bytes
        ///   remaining before the stream reaches its length.
        /// </summary>
        /// <returns>
        ///   The number of bytes written into the buffer; zero once the
        ///   stream length has been reached.
        /// </returns>
        public override int Read(byte[] buffer, int offset, int count)
        {
            if (buffer == null) throw new ArgumentNullException("buffer");
            if (offset < 0) throw new ArgumentOutOfRangeException("offset");
            // a negative count would otherwise move _bytesRead backwards
            if (count < 0) throw new ArgumentOutOfRangeException("count");

            int bytesToReadThisTime = count;
            if (_desiredLength - _bytesRead < bytesToReadThisTime)
                bytesToReadThisTime = unchecked((int)(_desiredLength - _bytesRead));

            int bytesToRead = bytesToReadThisTime;
            while (bytesToRead > 0)
            {
                // each pass copies from the start of a randomly selected chunk
                byte[] src = GetNewText();
                int chunksize = Math.Min(bytesToRead, src.Length);
                Buffer.BlockCopy(src, 0, buffer, offset, chunksize);
                bytesToRead -= chunksize;
                offset += chunksize;
            }
            _bytesRead += bytesToReadThisTime;
            return bytesToReadThisTime;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            throw new NotSupportedException();
        }

        public override bool CanRead
        {
            get { return true; }
        }

        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return false; }
        }

        public override long Length
        {
            get { return _desiredLength; }
        }

        /// <summary>
        ///   The current position within the stream, which is the number of
        ///   bytes read so far. Setting the position is not supported.
        /// </summary>
        public override long Position
        {
            // The position is the count of bytes consumed, not the bytes remaining.
            get { return _bytesRead; }
            set { throw new NotSupportedException(); }
        }

        public override long Seek(long offset, System.IO.SeekOrigin origin)
        {
            throw new NotSupportedException();
        }

        /// <summary>
        ///   Changes the total length of the stream. The length cannot be
        ///   set below the number of bytes already read.
        /// </summary>
        public override void SetLength(long value)
        {
            if (value < _bytesRead) throw new NotSupportedException();
            _desiredLength = value;
        }

        public override void Flush()
        {
        }
    }
}