import random
from collections import Counter

import numpy as np
class ListOperation(object):
    """Frequency and neighbour statistics over a list of items.

    The wrapped list is kept in ``list1``. In this file it holds either
    whole words or single characters.
    """

    def __init__(self, list1):
        self.list1 = list1

    def absFreq(self, listToCheck):
        """Return how often each entry of listToCheck occurs in list1.

        The result holds one count per entry of listToCheck, in the same
        order. Entries are compared with ``==``.
        """
        return [
            sum(1 for entry in self.list1 if entry == wanted)
            for wanted in listToCheck
        ]

    def relFreq(self, listToCheck):
        """Return the absFreq counts divided by their total.

        Raises ZeroDivisionError when listToCheck is not empty but none of
        its entries occurs in list1 (the counts then sum to 0).
        """
        counts = self.absFreq(listToCheck)
        # The total does not change inside the loop: compute it once.
        total = sum(counts)
        return [count / total for count in counts]

    def getNext(self, char):
        """Return the items that directly follow each occurrence of char.

        Every element of list1 is scanned position by position. When char
        is the last item of an element there is nothing after it, and a
        single space " " is recorded instead.
        """
        postChar = []
        for element in self.list1:
            lastIndex = len(element) - 1
            for position, current in enumerate(element):
                if current != char:
                    continue
                # Explicit bounds check instead of catching every exception.
                if position < lastIndex:
                    postChar.append(element[position + 1])
                else:
                    postChar.append(" ")
        return postChar

    def getFirstLetters(self):
        """Return a ListOperation holding the first item of each element.

        Raises IndexError if an element of list1 is empty.
        """
        return ListOperation([element[0] for element in self.list1])

    def getLastLetters(self):
        """Return a ListOperation holding the last item of each element.

        Raises IndexError if an element of list1 is empty.
        """
        return ListOperation([element[-1] for element in self.list1])

    def lowerCase(self):
        """Return a plain list with every element of list1 in lower case."""
        return [element.lower() for element in self.list1]
# Characters used to build the distributions: all 26 lowercase ASCII
# letters. The previous literal skipped "j" and "y", so those letters were
# never counted or generated. Any character outside this list is ignored
# when frequencies are computed.
alphabet = list("abcdefghijklmnopqrstuvwxyz")
# Languages supported
languages = ["english", "italian", "french", "german"]
def readData(name, path="C:\\text.txt"):
    """Read the language sample and return its whole content as a string.

    name -- label of the language being loaded (for example "english").
            It is not used to locate the file.
    path -- location of the .txt sample; defaults to the path this script
            has always read from.

    The longer the sample, the better the word composition.
    """
    # The with-block closes the file even when read() raises.
    with open(path, "r") as sample:
        return sample.read()
# Text uploaded
textToCheck = readData("english")
# The text is split into whitespace-separated words
textToCheckList = textToCheck.split()
# Create an instance of the class
textObjectList = ListOperation(textToCheckList)
#------------------------------------------
# For each letter of the alphabet, collect every character that directly
# follows it in the words of the text
postSequences = [textObjectList.getNext(letter) for letter in alphabet]
# For each sequence, the absolute frequency of every alphabet letter
distAbs = [ListOperation(seq).absFreq(alphabet) for seq in postSequences]
# And the relative frequency. distRel must keep exactly one row per
# alphabet letter, in alphabet order, because spitText picks a row with
# alphabet.index(). Dropping the rows whose counts sum to 0 would shift
# every later row onto the wrong letter, so such rows fall back to a
# uniform distribution (which also avoids dividing by zero).
uniformDist = [1.0 / len(alphabet)] * len(alphabet)
distRel = []
for counts in distAbs:
    total = sum(counts)
    if total == 0:
        distRel.append(list(uniformDist))
    else:
        distRel.append([count / total for count in counts])
#------------------------------------------
# List of letters at the beginning of a word, folded to lower case so
# that capitalised words are counted too
listFirstLetters = ListOperation(textObjectList.getFirstLetters().lowerCase())
# Distribution of the letter at the beginning of a word
absFreqFirstLetters = listFirstLetters.absFreq(alphabet)
relFreqFirstLetters = listFirstLetters.relFreq(alphabet)
#------------------------------------------
# List of letters at the end of a word, folded to lower case in the same
# way as the first letters so that both distributions are built alike
listLastLetters = ListOperation(textObjectList.getLastLetters().lowerCase())
# Distribution of the letter at the end of a word
absFreqLastLetters = listLastLetters.absFreq(alphabet)
relFreqLastLetters = listLastLetters.relFreq(alphabet)
def findWordLengDist(string, maxLength=18):
    """Return the length of each word in string, in order of appearance.

    Words are the whitespace-separated pieces of string. Words longer than
    maxLength characters are left out of the result.
    """
    return [len(word) for word in string.split() if len(word) <= maxLength]
def generateDist(list1):
    """Return the empirical distribution of the values in list1.

    The result is a two-item list [values, relFreq], i.e.
    [word lengths, relative frequency] for the way this file uses it:
    values holds the distinct entries of list1 in ascending order and
    relFreq[i] is the share of list1 entries equal to values[i].
    An empty list1 gives [[], []].
    """
    # One pass over the data instead of one pass per distinct value.
    counts = Counter(list1)
    lengthsList = sorted(counts)
    total = sum(counts.values())
    # Both lists are built from the same keys, so they always have the
    # same length and no consistency check is needed.
    relFreq = [counts[value] / total for value in lengthsList]
    return [lengthsList, relFreq]
##########################################
# Word composer function
##########################################
def spitText():
    """Compose, print and return one random word.

    The first character is drawn from relFreqFirstLetters, the word length
    from distributions, every following character from the distRel row of
    the character before it, and the closing character from
    relFreqLastLetters. The word has exactly the drawn length: a length of
    1 yields only the first character.

    Returns the string "0", after printing the error, when a draw fails.
    """
    try:
        # Initial character is chosen based on the distribution of the
        # first characters observed in the text given
        oldChar = np.random.choice(alphabet, p=relFreqFirstLetters)
        length = int(np.random.choice(distributions[0], p=distributions[1]))
        letters = [oldChar]
        # The first and the last character are drawn separately, so only
        # length - 2 characters come from the transition rows.
        for _ in range(length - 2):
            oldChar = np.random.choice(
                alphabet, p=distRel[alphabet.index(oldChar)])
            letters.append(oldChar)
        # Same thing for the last character
        if length > 1:
            letters.append(np.random.choice(alphabet, p=relFreqLastLetters))
        word = "".join(letters)
    except (IndexError, ValueError) as e:
        # np.random.choice raises ValueError for unusable probabilities;
        # IndexError means distRel has no row for the current character.
        print("Exception", e)
        return "0"
    print(word)
    return word
# Empirical distribution of the word lengths found in the sample text
distributions = generateDist(findWordLengDist(textToCheck))
# Generate 100 words
generatedWords = [spitText() for _ in range(100)]
# And print them out to the screen
print(generatedWords)
# Checking existence of words with nltk.corpus.
# NOTE(review): the original author states that this part works only with
# Python 2.7 -- not verified here.
from nltk.corpus import words

# Fetch the vocabulary once and keep it in a set, so each membership test
# below is a hash lookup instead of a fresh words.words() call followed
# by a linear scan.
knownWords = set(words.words())
# Example of word existence checking:
# '"word" in knownWords' is True if the word exists, False otherwise
# Load words from file, one candidate per line
with open("C:\\words.txt", "r") as wordsFile:
    wordsLoaded = wordsFile.read().split("\n")
# Keep only the candidates that are real words, in file order
realWords = [word for word in wordsLoaded if word in knownWords]
print(realWords)