stats-import: Custom edit to import only word frequency

This commit is contained in:
Hamcha 2016-02-14 10:33:28 +01:00
parent 7211de403a
commit 942651dacb
1 changed files with 182 additions and 116 deletions

View File

@ -7,8 +7,7 @@ import (
"flag"
"log"
"os"
"strconv"
"time"
"strings"
"github.com/boltdb/bolt"
)
@ -27,6 +26,8 @@ type Message struct {
Date int `json:"date"`
}
// UserCount holds per-user counters: the key is a username (processMessage
// uses Message.From.Username) and the value is how many times that user has
// been counted.
type UserCount map[string]uint64
type Stats struct {
Total uint64
ByUser map[string]uint64
@ -36,6 +37,7 @@ type Stats struct {
Replies uint64
Forward uint64
Username map[string]string
Words map[string]UserCount
}
func assert(err error) {
@ -84,6 +86,7 @@ func main() {
}
// processMessage folds a single message into the running statistics in data.
//
// Only the word-frequency import is active; the previous counters are kept
// below as commented-out code. For every whitespace-separated token of
// msg.Text the token is lower-cased and stripped of surrounding punctuation,
// and the counter data.Words[word][msg.From.Username] is incremented. Tokens
// that are shorter than three bytes after normalisation, or that look like
// links (start with "http"), are ignored.
func processMessage(msg Message, data *Stats) {
/*
data.Total++
if msg.ReplyID != nil {
@ -113,6 +116,33 @@ func processMessage(msg Message, data *Stats) {
data.ByDate[datekey] = val + 1
data.Username[msg.From.Username] = msg.From.FirstName
*/

	// Writing to a nil map panics, so make sure the word index exists.
	if data.Words == nil {
		data.Words = make(map[string]UserCount)
	}

	// strings.Fields splits on any run of whitespace, so words separated by
	// newlines or tabs are no longer glued together into a single key.
	for _, token := range strings.Fields(msg.Text) {
		word, ok := normalizeWord(token)
		if !ok {
			continue
		}

		perUser, found := data.Words[word]
		if !found {
			perUser = make(UserCount)
			data.Words[word] = perUser
		}
		// A missing user reads as zero, so a plain increment is enough.
		perUser[msg.From.Username]++
	}
}

// wordTrimSet lists the characters stripped from both ends of a token.
const wordTrimSet = " ?!.,:;/-_()[]{}'\"+=*^\n"

// minWordLen is the minimum length, in bytes, of a word worth counting.
const minWordLen = 3

// normalizeWord lower-cases token and strips surrounding punctuation. The
// second result is false when the token should not be counted: it is too
// short or it is a link.
//
// The filters run on the normalised form on purpose: checking the raw token
// let inputs such as "..." or "(http://x)" through, producing empty or URL
// keys in the index.
func normalizeWord(token string) (string, bool) {
	word := strings.Trim(strings.ToLower(token), wordTrimSet)
	if len(word) < minWordLen || strings.HasPrefix(word, "http") {
		return "", false
	}
	return word, true
}
func MakeUint(bval []byte, bucketName string, key string) uint64 {
@ -138,6 +168,7 @@ func PutUint(value uint64) []byte {
func update(db *bolt.DB, data Stats) error {
return db.Update(func(tx *bolt.Tx) error {
/*
b, err := tx.CreateBucketIfNotExists([]byte("global"))
if err != nil {
return err
@ -247,6 +278,41 @@ func update(db *bolt.DB, data Stats) error {
}
}
}
*/
// Add word frequency
b, err := tx.CreateBucketIfNotExists([]byte("words"))
if err != nil {
return err
}
for word, freq := range data.Words {
// Sanity check, you never know!
if len(word) < 1 {
continue
}
var count UserCount
val := b.Get([]byte(word))
if val == nil {
// No need to add, just apply the current count
count = freq
} else {
// Deserialize counter and add each user one by one
err = json.Unmarshal(val, &count)
if err != nil {
return err
}
for user, wcount := range freq {
count[user] += wcount
}
}
bval, err := json.Marshal(count)
if err != nil {
return err
}
err = b.Put([]byte(word), bval)
if err != nil {
return err
}
}
return nil
})