In general, computer programs are believed to be bad at being creative and doing tasks which require “a human mind”. Dealing with the meaning of text, words and sentences is one of these tasks. That’s not always the case. For instance sentiment analysis is a branch of Machine Learning where computer programs try to convey the overall feeling coming from tons of articles.
But here we are talking a lot more simpler: by using Markov chains and some statistics, I developed a simple computer program which generates words. The model works as follow:
As a first step, the program is fed with a long text in the selected language. The longer the text, the better. Next, the program analyses each word and gets the probability distributions for the following:
-Length of the word
-First character (or letter)
-Character (or letter) next to a given one
-Last character (or letter)
Then, once gathered all this data, the following function is run in order to generate words:
Each part is then glued together and returned by the function.
On average, 10% of generated words are English (or French, German or Italian) real words or prepositions or some kind of real sequence of characters.
The most interesting aspect, in my opinion, is the fact that by shifting the language of the text fed into the program, one can clearly see how the sequences of characters change dramatically, for instance, by feeding English, the program will be more likely to write “th” in words, by using German words will often contain “ge” and by using Italian words will often end by a vowel.
Here is the code. Note that in order to speed up the word checking process, I had to use the NLTK package which is availably only in Python 2. To avoid the use of Python 2 you could check each word using urllib and an online dictionary by parsing the web page but this way is tediously slow. It would take about 15 minutes to check 1000 words. By using NLTK you can speed up the checking process.
import numpy as np | |
import random | |
class ListOperation(object): | |
def __init__(self,list1): | |
self.list1 = list1 | |
# This function returns absolute frequencies of | |
# occurrence for the characters in listToCheck | |
def absFreq(self,listToCheck): | |
absFr = [] | |
for k in listToCheck: | |
freq = 0 | |
for j in self.list1: | |
if j == k: | |
freq += 1 | |
absFr.append(freq) | |
return absFr | |
# This function returns relative frequencies of | |
# occurrence for the characters in listToCheck | |
def relFreq(self,listToCheck): | |
absFreq = self.absFreq(listToCheck) | |
relFr = [] | |
for i in absFreq: | |
relFr.append(i/sum(absFreq)) | |
return relFr | |
# This function returns a list of the letters | |
# next to the one entered | |
def getNext(self,char): | |
postChar = [] | |
for i in self.list1: | |
for k in range(len(i)): | |
if i[k] == char: | |
try: | |
postChar.append(i[k+1]) | |
except: | |
postChar.append(" ") | |
return postChar | |
# This function returns an object of this class | |
# containing a list of the first letters in the text | |
def getFirstLetters(self): | |
firstList = [] | |
for i in self.list1: | |
firstList.append(i[0]) | |
listF = ListOperation(firstList) | |
return listF | |
# This function returns an object of this class | |
# containing a list of the last letters in the text | |
def getLastLetters(self): | |
lastLetters = [] | |
for i in self.list1: | |
length = len(i) | |
lastLetters.append(i[length-1]) | |
listU = ListOperation(lastLetters) | |
return listU | |
# This function returns all the elemets of the list in | |
# lower case | |
def lowerCase(self): | |
listLower = [] | |
for i in self.list1: | |
listLower.append(i.lower()) | |
return listLower | |
# Characters used to build a distribution | |
alphabet = ["a","b","c","d","e","f","g","h","i","k","l",\ | |
"m","n","o","p","q","r","s","t","u","v","w","x","z"] | |
# Languages supported | |
languages = ["english","italian","french","german"] | |
# This function reads data and returns a string | |
# you need to load a .txt document containing a | |
# sample of the language you selected. The longer | |
# the sample, the better word composition. | |
def readData(name): | |
file = open("C:\\text.txt","r") | |
data = file.read() | |
file.close() | |
return data | |
# Text uploaded | |
textToCheck = readData("english") | |
# The text is splitted | |
textToCheckList = textToCheck.split() | |
# Create an instance of the class | |
textObjectList = ListOperation(textToCheckList) | |
#------------------------------------------ | |
# Let's find all the characters after each one | |
# of those in the alphabet list | |
postSequences = [] | |
for letter in alphabet: | |
seqPost = textObjectList.getNext(letter) | |
postSequences.append(seqPost) | |
# For each postSequence we find the absolute frequency | |
distAbs = [] | |
for seq in postSequences: | |
distAObject = ListOperation(seq) | |
distA = distAObject.absFreq(alphabet) | |
distAbs.append(distA) | |
# And the relative frequency. However, we | |
# must be caareful and delete those which | |
# sum up to 0 to avoid exceptions with np | |
distRel = [] | |
for seq in postSequences: | |
distRObject = ListOperation(seq) | |
if sum(distRObject.absFreq(alphabet)) == 0: | |
pass | |
print("SOMETHING NOT ADDING TO 1") | |
else: | |
distR = distRObject.relFreq(alphabet) | |
distRel.append(distR) | |
#------------------------------------------ | |
# List of letters at the beginning of a word | |
listFirstLetters = ListOperation(textObjectList.getFirstLetters().lowerCase()) | |
# Distribution of the letter at the beginning of a word | |
absFreqFirstLetters = listFirstLetters.absFreq(alphabet) | |
relFreqFirstLetters = listFirstLetters.relFreq(alphabet) | |
#------------------------------------------ | |
# List of letters at the end of a word | |
listLastLetters = ListOperation(textObjectList.getLastLetters().list1) | |
# Distribution of the letter at the end of a word | |
absFreqLastLetters = listLastLetters.absFreq(alphabet) | |
relFreqLastLetters = listLastLetters.relFreq(alphabet) | |
#------------------------------------------ | |
# This function returns a list containing | |
# the length of each word in the string | |
def findWordLengDist(string): | |
wordList = string.split() | |
seqLength = [] | |
for word in wordList: | |
if len(word) <= 18: | |
seqLength.append(len(word)) | |
return seqLength | |
# This function returns the probability | |
# distribution of the characters in the | |
# string used as input | |
def generateDist(list1): | |
lengthsList = [] | |
for number in list1: | |
if number not in lengthsList: | |
lengthsList.append(number) | |
lengthsList.sort() | |
nLengthsList = [] | |
for number in lengthsList: | |
frequency = 0 | |
for occurringNumber in list1: | |
if number == occurringNumber: | |
frequency += 1 | |
nLengthsList.append(frequency) | |
# get relative frequency | |
relFreq = [] | |
for i in nLengthsList: | |
relFreq.append(i/sum(nLengthsList)) | |
# thislist = [lunghezza parole, frequenza relativa] | |
dataToReturn = [lengthsList,relFreq] | |
# These two should be equal | |
if len(lengthsList) != len(relFreq): | |
raise ValueError("""The two final lists have diff | |
erent length, please check the algorithm""") | |
return dataToReturn | |
########################################## | |
# Word composer function | |
########################################## | |
# This function outputs strings whose length | |
# is similar to the one of the real ones. | |
# Also, the structure should be at least similar | |
def spitText(): | |
try: | |
word = "" | |
# Initial character is chosen based on the | |
# distribution (oldChar) of the first | |
# characters observed in the text given | |
oldChar = np.random.choice(alphabet,replace=True,p=relFreqFirstLetters) | |
word = word + oldChar | |
for k in range(np.random.choice(distributions[0],replace=True,p=distributions[1])-1): | |
newChar = np.random.choice(alphabet,replace=True,p=distRel[alphabet.index(oldChar)]) | |
word = word + newChar | |
oldChar = newChar | |
# Same thing for the last character | |
lastChar = np.random.choice(alphabet,replace=True,p=relFreqLastLetters) | |
word += lastChar | |
print(word) | |
return word | |
except Exception as e: | |
print("Exception",e) | |
return "0" | |
distributions = generateDist(findWordLengDist(textToCheck)) | |
generatedWords = [] | |
# Generate 100 words | |
for i in range(100): | |
word = spitText() | |
generatedWords.append(word) | |
# And print them out to the screen | |
print(generatedWords) | |
# Checking existance of words with nltk.corpus | |
# and python 2.7. This works only with Python 2 | |
from nltk.corpus import words | |
# Example of word existance checking: | |
# True if the word exists, False otherwise | |
"word" in words.words() | |
# Load words from file | |
file = open("C:\\words.txt","r") | |
wordsLoaded = file.read().split("\n") | |
realWords = [] | |
for word in wordsLoaded: | |
if word in words.words(): | |
realWords.append(word) | |
print(realWords) |
Hope this was interesting.
Buy Speakers at Cheapest Price with Amazing Quality
ReplyDeletehttps://seventysevenstyle.com/collections/wireless-products/products/t90-5-in-1-4d-wireless-bluetooth-speaker-with-dancing-5-led-lights