@sset wrote:
Hi,
We are attempting text classification using R navie byes. For a data set it returns same values for class posterior probability. Actually it is only calculating class prior probability.
Below is code and training data
Code
library('log4r')
#logReset()
#basicConfig()
#addHandler(writeToFile, logger="RML", file="D:/Rnlp.log", level='DEBUG')
#with(getLogger(), names(handlers))
#loginfo('test %d', 1)
#RML <- create.logger(logfile = 'C:/software/absa/textmining/nlp/logs/RML.log', level = "DEBUG")
RML <- create.logger(logfile = 'D:/absa/textmining/nlp/logs/RML.log', level = "DEBUG")computeNavieByes=function(trainingDataPath,testData,isTrainingMode) {
debug(RML,'start compute naviebyes')
out <- tryCatch(
{
library(tm)
library(e1071)testDataTokens <-unlist(strsplit(testData, "[,]"))
dataText<-read.csv(trainingDataPath,header= TRUE)
trainvector <- as.vector(dataText$Text)
trainsource <- VectorSource(trainvector)
traincorpus <- Corpus(trainsource)#REMOVE STOPWORDS
traincorpus <- tm_map(traincorpus,stripWhitespace)
traincorpus <- tm_map(traincorpus,tolower)
traincorpus <- tm_map(traincorpus, removeWords,stopwords("english"))
traincorpus<- tm_map(traincorpus,removePunctuation)
traincorpus <- tm_map(traincorpus, PlainTextDocument)# CREATE TERM DOCUMENT MATRIX
trainmatrix <- t(TermDocumentMatrix(traincorpus))
model <- naiveBayes(as.matrix(trainmatrix),as.factor(dataText$Category))
col1 <- c()
index <- 1
resultsColl <- vector()
for (valueToken in testDataTokens)
{
col1[1] <- valueToken
dataTest <- data.frame("col1"=col1)
testvector <- as.vector(dataTest)
testsource <- VectorSource(testvector)
testcorpus <- Corpus(testsource)
testcorpus <- tm_map(testcorpus,stripWhitespace)
testcorpus <- tm_map(testcorpus,tolower)
testcorpus <- tm_map(testcorpus, removeWords,stopwords("english"))
testcorpus<- tm_map(testcorpus,removePunctuation)
testcorpus <- tm_map(testcorpus, PlainTextDocument)testmatrix <- t(TermDocumentMatrix(testcorpus)) print(valueToken) results<-predict(model, as.matrix(testmatrix),type="raw") print(class(results)) print(typeof(results)) print(results) #resultsColl[index] <- "hello world" resultsColl[index] <- toString(results) index <- index +1 debug(RML,'valueToken') debug(RML,valueToken) #print(valueToken) #debug(as.character(results)) } return (resultsColl)
},
error=function(cond)
{
error(RML,cond)
},
warning=function(cond)
{
warn(RML,cond)
return(cond)
},
finally={
}
)
debug(RML,'end compute naviebyes')
return(out)
}testing
result<- computeNavieByes("D:/axa/TrainNavieByes.csv","suspend suspend,smuggler smuggler","N")
print(result)
Training data
Text Category
laundering laundering laundering Money laundering
tax evasion tax evasion Money laundering
bank fraud Money laundering
terrorist terrorist terrorist terrorist Terrorist Financing
arms arms Terrorist Financing
weapon weapon Terrorist Financing
bribe bribe bribe bribe bribe Bribery and Corruption
corrupt corrupt corrupt Bribery and Corruption
kickback kickback Bribery and Corruption
fraud fraud fraud fraud Fraud and Regulatory Breaches
convict convict Fraud and Regulatory Breaches
breach breach Fraud and Regulatory BreachesThanks
Posts: 1
Participants: 1