kopia lustrzana https://github.com/cblgh/lieu
386 wiersze
9.3 KiB
Go
386 wiersze
9.3 KiB
Go
package database
|
|
|
|
/* example query
|
|
SELECT p.url
|
|
FROM inv_index index
|
|
INNER JOIN pages p ON p.id = index.pageid
|
|
WHERE index.word = "project";
|
|
|
|
select url from inv_index where word="esoteric" group by url order by sum(score) desc limit 15;
|
|
|
|
select url from inv_index where word = "<word>" group by url order by sum(score) desc;
|
|
*/
|
|
|
|
import (
|
|
"database/sql"
|
|
"fmt"
|
|
"lieu/types"
|
|
"lieu/util"
|
|
"log"
|
|
"net/url"
|
|
"strings"
|
|
"regexp"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
// languageCodeSanityRegex matches strings made up solely of ASCII letters,
// digits and hyphens. SearchWords uses it to drop language filter values
// containing any other character before they are turned into LIKE patterns.
var languageCodeSanityRegex = regexp.MustCompile("^[a-zA-Z\\-0-9]+$")
|
|
|
|
func InitDB(filepath string) *sql.DB {
|
|
db, err := sql.Open("sqlite3", filepath)
|
|
if err != nil {
|
|
log.Fatalln(err)
|
|
}
|
|
if db == nil {
|
|
log.Fatalln("db is nil")
|
|
}
|
|
createTables(db)
|
|
return db
|
|
}
|
|
|
|
// createTables runs one CREATE ... IF NOT EXISTS statement for each table this
// package uses: domains, stats, pages, external_pages, inv_index and the
// external_links FTS5 virtual table. The first statement that fails terminates
// the process through log.Fatalln.
func createTables(db *sql.DB) {
	// Statements are executed in slice order.
	schemas := []string{`
    CREATE TABLE IF NOT EXISTS domains (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        domain TEXT NOT NULL UNIQUE
    );
    `,
		`
    CREATE TABLE IF NOT EXISTS stats (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        last_crawl TEXT
    );
    `,
		`
    CREATE TABLE IF NOT EXISTS pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT NOT NULL UNIQUE,
        title TEXT,
        about TEXT,
        lang TEXT,
        domain TEXT NOT NULL,
        FOREIGN KEY(domain) REFERENCES domains(domain)
    );
    `,
		`
    CREATE TABLE IF NOT EXISTS external_pages (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT NOT NULL UNIQUE,
        domain TEXT NOT NULL,
        title TEXT
    );
    `,
		`
    CREATE TABLE IF NOT EXISTS inv_index (
        word TEXT NOT NULL,
        score INTEGER NOT NULL,
        url TEXT NOT NULL,
        FOREIGN KEY(url) REFERENCES pages(url)
    )`,
		`CREATE VIRTUAL TABLE IF NOT EXISTS external_links USING fts5 (url, tokenize="trigram")`,
	}

	for i := range schemas {
		_, err := db.Exec(schemas[i])
		if err == nil {
			continue
		}
		log.Fatalln(fmt.Errorf("failed to execute %s (%w)", schemas[i], err))
	}
}
|
|
|
|
/* TODO: filters
|
|
lang:en|fr|en|<..>
|
|
nosite:excluded-domain.com
|
|
|
|
"word1 word2 word3" strict query
|
|
|
|
query params:
|
|
&order=score, &order=count
|
|
*/
|
|
|
|
// emptyStringArray is the shared "no filter" value that the SearchWordsBy*
// wrappers pass to SearchWords for the filters they do not use.
var emptyStringArray = []string{}
|
|
|
|
func SearchWordsByScore(db *sql.DB, words []string) []types.PageData {
|
|
return SearchWords(db, words, true, emptyStringArray, emptyStringArray, emptyStringArray)
|
|
}
|
|
|
|
func SearchWordsBySite(db *sql.DB, words []string, domain string) []types.PageData {
|
|
// search words by site is same as search words by score, but adds a domain condition
|
|
return SearchWords(db, words, true, []string{domain}, emptyStringArray, emptyStringArray)
|
|
}
|
|
|
|
func SearchWordsByCount(db *sql.DB, words []string) []types.PageData {
|
|
return SearchWords(db, words, false, emptyStringArray, emptyStringArray, emptyStringArray)
|
|
}
|
|
|
|
func FulltextSearchWords(db *sql.DB, phrase string) []types.PageData {
|
|
query := fmt.Sprintf(`SELECT url from external_links WHERE url MATCH ? GROUP BY url ORDER BY RANDOM() LIMIT 30`)
|
|
|
|
stmt, err := db.Prepare(query)
|
|
util.Check(err)
|
|
defer stmt.Close()
|
|
|
|
rows, err := stmt.Query(phrase)
|
|
util.Check(err)
|
|
defer rows.Close()
|
|
|
|
var pageData types.PageData
|
|
var pages []types.PageData
|
|
for rows.Next() {
|
|
if err := rows.Scan(&pageData.URL); err != nil {
|
|
log.Fatalln(err)
|
|
}
|
|
pageData.Title = pageData.URL
|
|
pages = append(pages, pageData)
|
|
}
|
|
return pages
|
|
}
|
|
|
|
func UpdateCrawlDate(db *sql.DB, date string) {
|
|
stmt := `INSERT OR IGNORE INTO stats(last_crawl) VALUES (?)`
|
|
_, err := db.Exec(stmt, date)
|
|
if err != nil {
|
|
util.Check(fmt.Errorf("failed to update crawl date (%w)", err))
|
|
}
|
|
}
|
|
|
|
func GetLastCrawl(db *sql.DB) string {
|
|
rows, err := db.Query("SELECT last_crawl FROM stats WHERE last_crawl IS NOT NULL ORDER BY id DESC LIMIT 1")
|
|
util.Check(err)
|
|
defer rows.Close()
|
|
|
|
var date string
|
|
for rows.Next() {
|
|
err = rows.Scan(&date)
|
|
if err != nil {
|
|
util.Check(fmt.Errorf("failed to get last crawl (%w)", err))
|
|
}
|
|
}
|
|
return date
|
|
}
|
|
|
|
func GetDomainCount(db *sql.DB) int {
|
|
return countQuery(db, "domains")
|
|
}
|
|
|
|
func GetPageCount(db *sql.DB) int {
|
|
return countQuery(db, "pages")
|
|
}
|
|
|
|
func GetWordCount(db *sql.DB) int {
|
|
return countQuery(db, "inv_index")
|
|
}
|
|
|
|
func GetRandomDomain(db *sql.DB) string {
|
|
rows, err := db.Query("SELECT domain FROM domains ORDER BY RANDOM() LIMIT 1;")
|
|
util.Check(err)
|
|
defer rows.Close()
|
|
|
|
var domain string
|
|
for rows.Next() {
|
|
err = rows.Scan(&domain)
|
|
util.Check(err)
|
|
}
|
|
return domain
|
|
}
|
|
|
|
func GetRandomExternalLink(db *sql.DB) string {
|
|
rows, err := db.Query("SELECT url FROM external_links ORDER BY RANDOM() LIMIT 1;")
|
|
util.Check(err)
|
|
defer rows.Close()
|
|
|
|
var link string
|
|
for rows.Next() {
|
|
err = rows.Scan(&link)
|
|
util.Check(err)
|
|
}
|
|
return link
|
|
}
|
|
|
|
func GetRandomPage(db *sql.DB) string {
|
|
domain := GetRandomDomain(db)
|
|
stmt, err := db.Prepare("SELECT url FROM pages WHERE domain = ? ORDER BY RANDOM() LIMIT 1;")
|
|
defer stmt.Close()
|
|
util.Check(err)
|
|
|
|
rows, err := stmt.Query(domain)
|
|
defer rows.Close()
|
|
|
|
var link string
|
|
for rows.Next() {
|
|
err = rows.Scan(&link)
|
|
util.Check(err)
|
|
}
|
|
return link
|
|
}
|
|
|
|
func countQuery(db *sql.DB, table string) int {
|
|
rows, err := db.Query(fmt.Sprintf("SELECT COUNT(*) FROM %s;", table))
|
|
util.Check(err)
|
|
defer rows.Close()
|
|
|
|
var count int
|
|
for rows.Next() {
|
|
err = rows.Scan(&count)
|
|
util.Check(err)
|
|
}
|
|
return count
|
|
}
|
|
|
|
func SearchWords(db *sql.DB, words []string, searchByScore bool, domain []string, nodomain []string, language []string) []types.PageData {
|
|
var args []interface{}
|
|
|
|
wordlist := []string{"1"}
|
|
if len(words) > 0 && words[0] != "" {
|
|
wordlist = make([]string, 0)
|
|
for _, word := range words {
|
|
wordlist = append(wordlist, "word = ?")
|
|
args = append(args, strings.ToLower(word))
|
|
}
|
|
}
|
|
|
|
// the domains conditional defaults to just 'true' i.e. no domain condition
|
|
domains := []string{"1"}
|
|
if len(domain) > 0 && domain[0] != "" {
|
|
domains = make([]string, 0) // we've got at least one domain! clear domains default
|
|
for _, d := range domain {
|
|
domains = append(domains, "domain = ?")
|
|
args = append(args, d)
|
|
}
|
|
}
|
|
|
|
nodomains := []string{"1"}
|
|
if len(nodomain) > 0 && nodomain[0] != "" {
|
|
nodomains = make([]string, 0)
|
|
for _, d := range nodomain {
|
|
nodomains = append(nodomains, "domain != ?")
|
|
args = append(args, d)
|
|
}
|
|
}
|
|
|
|
//This needs some wildcard support …
|
|
languages := []string{"1"}
|
|
if len(language) > 0 && language[0] != "" {
|
|
languages = make([]string, 0)
|
|
for _, d := range language {
|
|
// Do a little check to avoid the database being DOSed
|
|
if languageCodeSanityRegex.MatchString(d) {
|
|
languages = append(languages, "lang LIKE ?")
|
|
args = append(args, d+"%")
|
|
}
|
|
}
|
|
}
|
|
|
|
orderType := "SUM(score)"
|
|
if !searchByScore {
|
|
orderType = "COUNT(*)"
|
|
}
|
|
|
|
query := fmt.Sprintf(`
|
|
SELECT p.url, p.about, p.title
|
|
FROM inv_index inv INNER JOIN pages p ON inv.url = p.url
|
|
WHERE (%s)
|
|
AND (%s)
|
|
AND (%s)
|
|
AND (%s)
|
|
GROUP BY inv.url
|
|
ORDER BY %s
|
|
DESC
|
|
LIMIT 15
|
|
`, strings.Join(wordlist, " OR "), strings.Join(domains, " OR "), strings.Join(nodomains, " AND "), strings.Join(languages, " OR "), orderType)
|
|
|
|
stmt, err := db.Prepare(query)
|
|
util.Check(err)
|
|
defer stmt.Close()
|
|
|
|
rows, err := stmt.Query(args...)
|
|
util.Check(err)
|
|
defer rows.Close()
|
|
|
|
var pageData types.PageData
|
|
var pages []types.PageData
|
|
for rows.Next() {
|
|
if err := rows.Scan(&pageData.URL, &pageData.About, &pageData.Title); err != nil {
|
|
log.Fatalln(err)
|
|
}
|
|
pages = append(pages, pageData)
|
|
}
|
|
return pages
|
|
}
|
|
|
|
func InsertManyDomains(db *sql.DB, pages []types.PageData) {
|
|
if len(pages) == 0 {
|
|
return
|
|
}
|
|
values := make([]string, 0, len(pages))
|
|
args := make([]interface{}, 0, len(pages))
|
|
|
|
for _, b := range pages {
|
|
values = append(values, "(?)")
|
|
u, err := url.Parse(b.URL)
|
|
util.Check(err)
|
|
args = append(args, u.Hostname())
|
|
}
|
|
|
|
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO domains(domain) VALUES %s`, strings.Join(values, ","))
|
|
_, err := db.Exec(stmt, args...)
|
|
util.Check(err)
|
|
}
|
|
|
|
func InsertManyPages(db *sql.DB, pages []types.PageData) {
|
|
if len(pages) == 0 {
|
|
return
|
|
}
|
|
values := make([]string, 0, len(pages))
|
|
args := make([]interface{}, 0, len(pages))
|
|
|
|
for _, b := range pages {
|
|
// url, title, lang, about, domain
|
|
values = append(values, "(?, ?, ?, ?, ?)")
|
|
u, err := url.Parse(b.URL)
|
|
util.Check(err)
|
|
args = append(args, b.URL, b.Title, b.Lang, b.About, u.Hostname())
|
|
}
|
|
|
|
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO pages(url, title, lang, about, domain) VALUES %s`, strings.Join(values, ","))
|
|
_, err := db.Exec(stmt, args...)
|
|
util.Check(err)
|
|
}
|
|
|
|
func InsertManyWords(db *sql.DB, batch []types.SearchFragment) {
|
|
if len(batch) == 0 {
|
|
return
|
|
}
|
|
|
|
values := make([]string, 0, len(batch))
|
|
args := make([]interface{}, 0, len(batch))
|
|
|
|
for _, b := range batch {
|
|
pageurl := strings.TrimSuffix(b.URL, "/")
|
|
values = append(values, "(?, ?, ?)")
|
|
args = append(args, b.Word, pageurl, b.Score)
|
|
}
|
|
|
|
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO inv_index(word, url, score) VALUES %s`, strings.Join(values, ","))
|
|
_, err := db.Exec(stmt, args...)
|
|
util.Check(err)
|
|
}
|
|
|
|
func InsertManyExternalLinks(db *sql.DB, externalLinks []string) {
|
|
if len(externalLinks) == 0 {
|
|
return
|
|
}
|
|
|
|
values := make([]string, 0, len(externalLinks))
|
|
args := make([]interface{}, 0, len(externalLinks))
|
|
|
|
for _, externalLink := range externalLinks {
|
|
values = append(values, "(?)")
|
|
args = append(args, externalLink)
|
|
}
|
|
|
|
stmt := fmt.Sprintf(`INSERT OR IGNORE INTO external_links(url) VALUES %s`, strings.Join(values, ","))
|
|
_, err := db.Exec(stmt, args...)
|
|
util.Check(err)
|
|
}
|