|
import numpy as np |
|
import random |
|
|
|
class ListOperation(object): |
|
|
|
def __init__(self,list1): |
|
self.list1 = list1 |
|
# This function returns absolute frequencies of |
|
# occurrence for the characters in listToCheck |
|
def absFreq(self,listToCheck): |
|
absFr = [] |
|
for k in listToCheck: |
|
freq = 0 |
|
for j in self.list1: |
|
if j == k: |
|
freq += 1 |
|
absFr.append(freq) |
|
return absFr |
|
|
|
# This function returns relative frequencies of |
|
# occurrence for the characters in listToCheck |
|
def relFreq(self,listToCheck): |
|
absFreq = self.absFreq(listToCheck) |
|
relFr = [] |
|
for i in absFreq: |
|
relFr.append(i/sum(absFreq)) |
|
return relFr |
|
|
|
# This function returns a list of the letters |
|
# next to the one entered |
|
def getNext(self,char): |
|
postChar = [] |
|
for i in self.list1: |
|
for k in range(len(i)): |
|
if i[k] == char: |
|
try: |
|
postChar.append(i[k+1]) |
|
except: |
|
postChar.append(" ") |
|
return postChar |
|
|
|
# This function returns an object of this class |
|
# containing a list of the first letters in the text |
|
def getFirstLetters(self): |
|
firstList = [] |
|
for i in self.list1: |
|
firstList.append(i[0]) |
|
listF = ListOperation(firstList) |
|
return listF |
|
|
|
# This function returns an object of this class |
|
# containing a list of the last letters in the text |
|
def getLastLetters(self): |
|
lastLetters = [] |
|
for i in self.list1: |
|
length = len(i) |
|
lastLetters.append(i[length-1]) |
|
listU = ListOperation(lastLetters) |
|
return listU |
|
|
|
# This function returns all the elemets of the list in |
|
# lower case |
|
def lowerCase(self): |
|
listLower = [] |
|
for i in self.list1: |
|
listLower.append(i.lower()) |
|
return listLower |
|
|
|
# Characters used to build a distribution |
|
alphabet = ["a","b","c","d","e","f","g","h","i","k","l",\ |
|
"m","n","o","p","q","r","s","t","u","v","w","x","z"] |
|
# Languages supported |
|
languages = ["english","italian","french","german"] |
|
|
|
# This function reads data and returns a string |
|
# you need to load a .txt document containing a |
|
# sample of the language you selected. The longer |
|
# the sample, the better word composition. |
|
def readData(name): |
|
file = open("C:\\text.txt","r") |
|
data = file.read() |
|
file.close() |
|
return data |
|
|
|
# Text uploaded |
|
textToCheck = readData("english") |
|
|
|
# The text is splitted |
|
textToCheckList = textToCheck.split() |
|
|
|
# Create an instance of the class |
|
textObjectList = ListOperation(textToCheckList) |
|
|
|
#------------------------------------------ |
|
# Let's find all the characters after each one |
|
# of those in the alphabet list |
|
postSequences = [] |
|
for letter in alphabet: |
|
seqPost = textObjectList.getNext(letter) |
|
postSequences.append(seqPost) |
|
|
|
# For each postSequence we find the absolute frequency |
|
distAbs = [] |
|
for seq in postSequences: |
|
distAObject = ListOperation(seq) |
|
distA = distAObject.absFreq(alphabet) |
|
distAbs.append(distA) |
|
|
|
# And the relative frequency. However, we |
|
# must be caareful and delete those which |
|
# sum up to 0 to avoid exceptions with np |
|
distRel = [] |
|
for seq in postSequences: |
|
distRObject = ListOperation(seq) |
|
if sum(distRObject.absFreq(alphabet)) == 0: |
|
pass |
|
print("SOMETHING NOT ADDING TO 1") |
|
else: |
|
distR = distRObject.relFreq(alphabet) |
|
distRel.append(distR) |
|
|
|
#------------------------------------------ |
|
# List of letters at the beginning of a word |
|
listFirstLetters = ListOperation(textObjectList.getFirstLetters().lowerCase()) |
|
|
|
# Distribution of the letter at the beginning of a word |
|
absFreqFirstLetters = listFirstLetters.absFreq(alphabet) |
|
relFreqFirstLetters = listFirstLetters.relFreq(alphabet) |
|
#------------------------------------------ |
|
# List of letters at the end of a word |
|
listLastLetters = ListOperation(textObjectList.getLastLetters().list1) |
|
|
|
# Distribution of the letter at the end of a word |
|
absFreqLastLetters = listLastLetters.absFreq(alphabet) |
|
relFreqLastLetters = listLastLetters.relFreq(alphabet) |
|
#------------------------------------------ |
|
# This function returns a list containing |
|
# the length of each word in the string |
|
def findWordLengDist(string): |
|
wordList = string.split() |
|
seqLength = [] |
|
for word in wordList: |
|
if len(word) <= 18: |
|
seqLength.append(len(word)) |
|
return seqLength |
|
|
|
# This function returns the probability |
|
# distribution of the characters in the |
|
# string used as input |
|
def generateDist(list1): |
|
lengthsList = [] |
|
for number in list1: |
|
if number not in lengthsList: |
|
lengthsList.append(number) |
|
lengthsList.sort() |
|
|
|
nLengthsList = [] |
|
for number in lengthsList: |
|
frequency = 0 |
|
for occurringNumber in list1: |
|
if number == occurringNumber: |
|
frequency += 1 |
|
nLengthsList.append(frequency) |
|
|
|
# get relative frequency |
|
relFreq = [] |
|
for i in nLengthsList: |
|
relFreq.append(i/sum(nLengthsList)) |
|
|
|
# thislist = [lunghezza parole, frequenza relativa] |
|
dataToReturn = [lengthsList,relFreq] |
|
# These two should be equal |
|
if len(lengthsList) != len(relFreq): |
|
raise ValueError("""The two final lists have diff |
|
erent length, please check the algorithm""") |
|
return dataToReturn |
|
|
|
########################################## |
|
# Word composer function |
|
########################################## |
|
# This function outputs strings whose length |
|
# is similar to the one of the real ones. |
|
# Also, the structure should be at least similar |
|
def spitText(): |
|
try: |
|
word = "" |
|
# Initial character is chosen based on the |
|
# distribution (oldChar) of the first |
|
# characters observed in the text given |
|
oldChar = np.random.choice(alphabet,replace=True,p=relFreqFirstLetters) |
|
word = word + oldChar |
|
for k in range(np.random.choice(distributions[0],replace=True,p=distributions[1])-1): |
|
newChar = np.random.choice(alphabet,replace=True,p=distRel[alphabet.index(oldChar)]) |
|
word = word + newChar |
|
oldChar = newChar |
|
# Same thing for the last character |
|
lastChar = np.random.choice(alphabet,replace=True,p=relFreqLastLetters) |
|
word += lastChar |
|
print(word) |
|
return word |
|
except Exception as e: |
|
print("Exception",e) |
|
return "0" |
|
|
|
distributions = generateDist(findWordLengDist(textToCheck)) |
|
|
|
generatedWords = [] |
|
# Generate 100 words |
|
for i in range(100): |
|
word = spitText() |
|
generatedWords.append(word) |
|
|
|
# And print them out to the screen |
|
print(generatedWords) |
|
|
|
# Checking existance of words with nltk.corpus |
|
# and python 2.7. This works only with Python 2 |
|
|
|
from nltk.corpus import words |
|
|
|
# Example of word existance checking: |
|
# True if the word exists, False otherwise |
|
"word" in words.words() |
|
|
|
|
|
# Load words from file |
|
file = open("C:\\words.txt","r") |
|
wordsLoaded = file.read().split("\n") |
|
|
|
realWords = [] |
|
|
|
for word in wordsLoaded: |
|
if word in words.words(): |
|
realWords.append(word) |
|
|
|
print(realWords) |