// RandomTextGenerator.cs
// ------------------------------------------------------------------
//
// Copyright (c) 2009 Dino Chiesa
// All rights reserved.
//
// This code module is part of DotNetZip, a zipfile class library.
//
// ------------------------------------------------------------------
//
// This code is licensed under the Microsoft Public License.
// See the file License.txt for the license details.
// More info on: http://dotnetzip.codeplex.com
//
// ------------------------------------------------------------------
//
// last saved (in emacs):
// Time-stamp: <2011-July-13 16:37:19>
//
// ------------------------------------------------------------------
//
// This module defines a class that generates random text sequences
// using a Markov chain.
//
// ------------------------------------------------------------------
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.IO;
using Ionic.Zip;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Threading;
namespace Ionic.Zip.Tests.Utilities
{
public class RandomTextGenerator
{
static string[] uris = new string[]
{
// "Through the Looking Glass", by Lewis Carroll (~181k)
"http://www.gutenberg.org/files/12/12.txt",
// Decl of Independence (~16k)
"http://www.gutenberg.org/files/16780/16780.txt",
// Decl of Independence, alternative source
"http://www.constitution.org/usdeclar.txt",
// Section 552a of the US code - on privacy for individuals
"http://www.opm.gov/feddata/usc552a.txt",
// The Naval War of 1812, by Theodore Roosevelt (968k)
"http://www.gutenberg.org/dirs/etext05/7trnv10.txt",
// On Prayer and the Contemplative Life, by Thomas Aquinas (440k)
"http://www.gutenberg.org/files/22295/22295.txt",
// IETF RFC 1951 - the DEFLATE format
"http://www.ietf.org/rfc/rfc1951.txt",
// pkware's appnote
"http://www.pkware.com/documents/casestudies/APPNOTE.TXT",
};
SimpleMarkovChain markov;
public RandomTextGenerator()
{
System.Random rnd = new System.Random();
string seedText = null;
int cycles = 0;
do {
try
{
string uri= uris[rnd.Next(uris.Length)];
seedText = GetPageMarkup(uri);
}
catch (System.Net.WebException)
{
cycles++;
if (cycles>8) throw;
seedText = null;
}
} while (seedText == null);
markov = new SimpleMarkovChain(seedText);
}
public string Generate(int length)
{
return markov.GenerateText(length);
}
private static string GetPageMarkup(string uri)
{
string pageData = null;
using (WebClient client = new WebClient())
{
pageData = client.DownloadString(uri);
}
return pageData;
}
}
///
/// Implements a simple Markov chain for text.
///
///
///
/// Uses a Markov chain starting with some base texts to produce
/// random natural-ish text. This implementation is based on Pike's
/// perl implementation, see
/// http://cm.bell-labs.com/cm/cs/tpop/markov.pl
///
public class SimpleMarkovChain
{
Dictionary> table = new Dictionary>();
System.Random rnd = new System.Random();
public SimpleMarkovChain(string seed)
{
string NEWLINE = "\n";
string key = NEWLINE;
var sr = new StringReader(seed);
string line;
while ((line = sr.ReadLine()) != null)
{
foreach (var word in line.SplitByWords())
{
var w = (word == "") ? NEWLINE : word; // newline
if (word == "\r") w = NEWLINE;
if (!table.ContainsKey(key)) table.Add(key, new List());
table[key].Add(w);
key = w.ToLower().TrimPunctuation();
}
}
if (!table.ContainsKey(key)) table.Add(key, new List());
table[key].Add(NEWLINE);
key = NEWLINE;
}
internal void Diag()
{
Console.WriteLine("There are {0} keys in the table", table.Keys.Count);
foreach (string s in table.Keys)
{
string x = s.Replace("\n", "�");
var y = table[s].ToArray();
Console.WriteLine(" {0}: {1}", x, String.Join(", ", y));
}
}
internal void ShowList(string word)
{
string x = word.Replace("\n", "�");
if (table.ContainsKey(word))
{
var y = table[word].ToArray();
var z = Array.ConvertAll(y, x1 => x1.Replace("\n", "�"));
Console.WriteLine(" {0}: {1}", x, String.Join(", ", z));
}
else
Console.WriteLine(" {0}: -key not found-", x);
}
private List _keywords;
private List keywords
{
get
{
if (_keywords == null)
_keywords = new List(table.Keys);
return _keywords;
}
}
///
/// Generates random text with a minimum character length.
///
///
///
/// The minimum length of text, in characters, to produce.
///
public string GenerateText(int minimumLength)
{
var chosenStartWord = keywords[rnd.Next(keywords.Count)];
return _InternalGenerate(chosenStartWord, StopCriterion.NumberOfChars, minimumLength);
}
///
/// Generates random text with a minimum character length.
///
///
///
/// The first sentence will start with the given start word.
///
///
///
/// The minimum length of text, in characters, to produce.
///
///
/// The word to start with. If this word does not exist in the
/// seed text, the generation will fail.
///
///
///
///
public string GenerateText(string start, int minimumLength)
{
return _InternalGenerate(start, StopCriterion.NumberOfChars, minimumLength);
}
///
/// Generate random text with a minimum number of words.
///
///
///
/// The first sentence will start with the given start word.
///
///
///
/// The minimum number of words of text to produce.
///
///
/// The word to start with. If this word does not exist in the
/// seed text, the generation will fail.
///
///
///
///
public string GenerateWords(string start, int minimumWords)
{
return _InternalGenerate(start, StopCriterion.NumberOfWords, minimumWords);
}
///
/// Generate random text with a minimum number of words.
///
///
///
/// The minimum number of words of text to produce.
///
///
///
public string GenerateWords(int minimumWords)
{
var chosenStartWord = keywords[rnd.Next(keywords.Count)];
return _InternalGenerate(chosenStartWord, StopCriterion.NumberOfWords, minimumWords);
}
private string _InternalGenerate(string start, StopCriterion crit, int limit)
{
string w1 = start.ToLower();
StringBuilder sb = new StringBuilder();
sb.Append(start.Capitalize());
int consecutiveNewLines = 0;
string word = null;
string priorWord = null;
// About the stop criteria:
// we keep going til we reach the specified number of words or chars, with the added
// proviso that we have to complete the in-flight sentence when the limit is reached.
for (int i = 0;
(crit == StopCriterion.NumberOfWords && i < limit) ||
(crit == StopCriterion.NumberOfChars && sb.Length < limit) ||
consecutiveNewLines == 0;
i++)
{
if (table.ContainsKey(w1))
{
var list = table[w1];
int ix = rnd.Next(list.Count);
priorWord = word;
word = list[ix];
if (word != "\n")
{
// capitalize
if (consecutiveNewLines > 0)
sb.Append(word.Capitalize());
else
sb.Append(" ").Append(word);
// words that end sentences get a newline
if (word.EndsWith("."))
{
if (consecutiveNewLines == 0 || consecutiveNewLines == 1)
sb.Append("\n");
consecutiveNewLines++;
}
else consecutiveNewLines = 0;
}
w1 = word.ToLower().TrimPunctuation();
}
}
return sb.ToString();
}
private enum StopCriterion
{
NumberOfWords,
NumberOfChars
}
}
public class RandomTextInputStream : Stream
{
RandomTextGenerator _rtg;
Int64 _desiredLength;
Int64 _bytesRead;
System.Text.Encoding _encoding;
byte[][] _randomText;
System.Random _rnd;
int _gnt;
byte[] src = null;
private static readonly int _chunkSize = 1024 * 128;
private static readonly int _chunks = 48;
public RandomTextInputStream(Int64 length)
: this(length, System.Text.Encoding.GetEncoding("ascii"))
{
}
public RandomTextInputStream(Int64 length, System.Text.Encoding encoding)
: base()
{
_desiredLength = length;
_rtg = new RandomTextGenerator();
_encoding = encoding;
_randomText = new byte[_chunks][];
_rnd = new System.Random();
}
///
/// for diagnostic purposes only
///
public int GetNewTextCount
{
get
{
return _gnt;
}
}
new public void Dispose()
{
Dispose(true);
}
/// The Dispose method
protected override void Dispose(bool disposeManagedResources)
{
}
private byte[] GetNewText()
{
_gnt++;
int nowServing = _rnd.Next(_chunks);
if (_randomText[nowServing]==null)
_randomText[nowServing] = _encoding.GetBytes(_rtg.Generate(_chunkSize));
return _randomText[nowServing];
}
public Int64 BytesRead
{
get { return _bytesRead; }
}
public override int Read(byte[] buffer, int offset, int count)
{
int bytesToReadThisTime = count;
if (_desiredLength - _bytesRead < bytesToReadThisTime)
bytesToReadThisTime = unchecked((int)(_desiredLength - _bytesRead));
int bytesToRead = bytesToReadThisTime;
while (bytesToRead > 0)
{
src = GetNewText();
int bytesAvailable = src.Length;
int chunksize = (bytesToRead > bytesAvailable)
? bytesAvailable
: bytesToRead;
Buffer.BlockCopy(src, 0, buffer, offset, chunksize);
bytesToRead -= chunksize;
offset += chunksize;
}
_bytesRead += bytesToReadThisTime;
return bytesToReadThisTime;
}
public override void Write(byte[] buffer, int offset, int count)
{
throw new NotSupportedException();
}
public override bool CanRead
{
get { return true; }
}
public override bool CanSeek
{
get { return false; }
}
public override bool CanWrite
{
get { return false; }
}
public override long Length
{
get { return _desiredLength; }
}
public override long Position
{
get { return _desiredLength - _bytesRead; }
set
{
throw new NotSupportedException();
}
}
public override long Seek(long offset, System.IO.SeekOrigin origin)
{
throw new NotSupportedException();
}
public override void SetLength(long value)
{
if (value < _bytesRead)
throw new NotSupportedException();
_desiredLength = value;
}
public override void Flush()
{
}
}
}