#50_140825_R_TWITTER 분석

728x90

# TWITTER 분석

n 환경구성

1. 아래 URL 접속

http://apps.twitter.com/app/new

- 아래의 3가지 항목을 입력하고 동의한 후 create 클릭

2. api key 확인

- 아래의 API key와 API secret 를 잘 저장해둔다

3. 아래의 명령어 순서대로 수행

> install.packages("twitteR")

> install.packages("plyr")

> install.packages("stringr")

> install.packages("ggplot2")

> library(twitteR)

> library(ROAuth)

> library(plyr)

> library(stringr)

> library(ggplot2)

> download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")

> requestURL <- "https://api.twitter.com/oauth/request_token"

> accessURL <- "https://api.twitter.com/oauth/access_token"

> authURL <- https://api.twitter.com/oauth/authorize

> APIKey <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

> APIsecret <- "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

- 좀전에 기억해둔 apikey와 apisecret를 적어서 위 쌍 따옴표 사이에 적어서 수행한다

> Cred <- OAuthFactory$new(consumerKey=APIKey,

consumerSecret=APIsecret,

requestURL=requestURL,

accessURL=accessURL,

authURL=authURL)

> Cred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))

To enable the connection, please direct your web browser to:

https://api.twitter.com/oauth/authorize?oauth_token=i5R6YnhthPM7DIknM9ktNujhBmaYsrsrnOIXprR9A

When complete, record the PIN given to you and provide it here: 9406075

- 위에 URL을 들어가서 앱인증을 누르면 번호가 하나 나오는데 그 번호를 입력한다

> save(Cred, file="twitter authentication.Rdata")

> registerTwitterOAuth(Cred)

[1] TRUE

- TRUE면 정상동작을 의미한다

4. 원하는 검색어를 트위터에서 검색하여 csv 파일로 생성

- 류현진 : http://me2.do/xuYFVAEl

> Dodgers.list <- searchTwitter('Dodgers', lang="en", n=500, cainfo="cacert.pem")

> Dodgers.df = twListToDF(Dodgers.list)

> write.csv(Dodgers.df, file='DodgersTweets.csv', row.names=F)

- 추신수 : http://me2.do/FCTZaOgw

> Rangers.list <- searchTwitter('Rangers', lang="en", n=500, cainfo="cacert.pem")

> Rangers.df = twListToDF(Rangers.list)

> write.csv(Rangers.df, file='RangersTweets.csv', row.names=F)

- 윤석민 : http://me2.do/G0apHk1x

> ioles.list <- searchTwitter('Orioles', lang="en", n=500, cainfo="cacert.pem")

> Orioles.df = twListToDF(Orioles.list)

> write.csv(Orioles.df, file='OriolesTweets.csv', row.names=F)

5. 함수 생성후 긍정과 부정 비교

n 함수 생성

score.sentiment = function(sentences, pos.words, neg.words, .progress='none')

{

require(plyr)

require(stringr)

# we got a vector of sentences.

# plyr will handle a list or a vector as an "l" for us

# we want a simple array ("a") of scores back,

# so we use "l" + "a" + "ply" = "laply":

scores = laply(sentences, function(sentence, pos.words, neg.words) {

# clean up sentences with R's regex-driven global substitute, gsub():

sentence = gsub('[[:punct:]]', '', sentence) # gsub : http://me2.do/xzZFxvHu

sentence = gsub('[[:cntrl:]]', '', sentence)

sentence = gsub('\\d+', '', sentence)

# and convert to lower case:

sentence = tolower(sentence)

# split into words. str_split is in the stringr package

word.list = str_split(sentence, '\\s+')

# sometimes a list() is one level of hierarchy too much

words = unlist(word.list)

# compare our words to the dictionaries of positive & negative terms

pos.matches = match(words, pos.words) # match : http://me2.do/FHBclNZX

neg.matches = match(words, neg.words)

# match() returns the position of the matched term or NA

# we just want a TRUE/FALSE:

pos.matches = !is.na(pos.matches)

neg.matches = !is.na(neg.matches)

# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():

score = sum(pos.matches) - sum(neg.matches)

return(score)

}, pos.words, neg.words, .progress=.progress )

scores.df = data.frame(score=scores, text=sentences)

return(scores.df)

}

n 긍정 부정 단어와 csv 파일 불러오기

# Load sentiment word lists

> hu.liu.pos <- scan("positive-words.txt", what='character', comment.char=';')

Read 2006 items

> hu.liu.neg <- scan("negative-words.txt", what='character', comment.char=';')

Read 4783 items

# Add words to list

> pos.words <- c(hu.liu.pos, 'upgrade')

> neg.words <- c(hu.liu.neg, 'wtf', 'wait','waiting', 'epicfail', 'mechanical')

# Import 3 csv

> DatasetDodgers <- read.csv("DodgersTweets.csv")

> DatasetDodgers$text <- as.factor(DatasetDodgers$text)

> DatasetRangers <- read.csv("RangersTweets.csv")

> DatasetRangers$text <- as.factor(DatasetRangers$text)

> DatasetOrioles <- read.csv("OriolesTweets.csv")

> DatasetOrioles$text <- as.factor(DatasetOrioles$text)

# Score all tweets

> Dodgers.scores <- score.sentiment(DatasetDodgers$text, pos.words,neg.words, .progress='text')

> Rangers.scores <- score.sentiment(DatasetRangers$text, pos.words,neg.words, .progress='text')

> Orioles.scores <- score.sentiment(DatasetOrioles$text, pos.words,neg.words, .progress='text')

> write.csv(Dodgers.scores,file='DodgersScores.csv',row.names=TRUE)

> write.csv(Rangers.scores,file='RangersScores.csv',row.names=TRUE)

> write.csv(Orioles.scores,file='OriolesScores.csv',row.names=TRUE)

> Dodgers.scores$Team <- 'LA Dodgers'

> Rangers.scores$Team <- 'Texas Rangers'

> Orioles.scores$Team <- 'Baltimore Orioles'

6. 긍정 부정 비교 그래프 생성

> hist(Rangers.scores$score, col="bisque")

> qplot(Rangers.scores$score)

> all.scores <- rbind(Rangers.scores, Dodgers.scores, Orioles.scores)

> ggplot(data=all.scores) +

geom_bar(mapping=aes(x=score, fill=Team), binwidth=1) +

facet_grid(Team~.) +

theme_bw() +

scale_fill_brewer()

n LindsayLohan 긍정과 부정 비교

> LindsayLohan.list <- searchTwitter('LindsayLohan', lang="en", n=500, cainfo="cacert.pem")

> LindsayLohan.df = twListToDF(LindsayLohan.list)

> write.csv(samsung.df, file='LindsayLohan.csv', row.names=F)

> LindsayLohan <- read.csv("LindsayLohan.csv")

> LindsayLohan$text <- as.factor(LindsayLohan$text)

> LindsayLohan.scores <- score.sentiment(LindsayLohan$text, pos.words,neg.words, .progress='text')

> write.csv(LindsayLohan.scores,file=' LindsayLohan.csv',row.names=TRUE)

> LindsayLohan.scores$Team <- 'LindsayLohan'

> hist(LindsayLohan.scores$score, col="bisque")

n 트위터 해킹에서 한글깨짐 현상 해결

1. bios 모드 > Virtualization Technology > Enabled 모드로 설정

2. 오라클 vmware 가져오기를 이용해서 오라클 이미지를 가져온다

3. 트위터 원하는 키워드로 csv파일 생성

4. ‘내폴더’ 로 들어가서 생성된 csv파일을 열고 다른이름으로 저장한 후 아래와 ‘모든 포맷’을 클릭한 후 xml 이나 xls 로 확장자를 바꿔 생성한다

5. 우분투에서 메일이나 usb를 이용해서 윈도우로 생성된 파일을 가져오고 비교 작업을 수행

저작자표시 비영리 변경금지

'빅데이터과정 > R' 카테고리의 다른 글

#50_140825_R_이상치, 중앙값, 상한, 하한 (0)	2014.08.26
#50_140825_R_PIE CAHRT (0)	2014.08.26
#50_140825_R_움직이는 GRAPH (0)	2014.08.25
#50_140825_R_AUDIO GRAPH (0)	2014.08.25
#49_140822_R_LINE CHART (0)	2014.08.22

#50_140825_R_TWITTER 분석

# TWITTER 분석

'빅데이터과정 > R' 카테고리의 다른 글

'빅데이터과정/R' Related Articles

티스토리툴바