Why am I getting an empty DB from mbox?

330 views Asked by At

So I have this code, that takes mbox files and then convert them into sqlite database. However, I am getting an empty database. No matter which mbox I use, the resulting database is 12kb. When I try to view the database/analyze it, it results with nothing, no information or anything, despite that the tables and keys are made. but no information inside it. what could be the problem? is it that the mbox is not been selected by the script, or something wrong inside the loop?

import mailbox
import os
import email
from bs4 import BeautifulSoup
import sqlite3

# Current working directory
cwd = os.path.dirname(os.path.realpath(__file__))

# First find all the mbox files using os.walk
mbox_path = cwd + '/mbox_files'
mbox_files = []
pattern = '*.mbox'

for root, dirs, files in os.walk(mbox_path):
    for filename in fnmatch.filter(files, pattern):
        mbox_files.append((filename, os.path.join(root, filename)))

# Now process each message in the folder
for mbox_file in mbox_files:
    src_mbox = mailbox.mbox(mbox_file[1])
    for msg in src_mbox:
        sender = name_email(msg['From'])
        recipient = name_email(msg['To'])
        b = email.message_from_string(str(msg))
        if b.is_multipart():
            for payload in b.get_payload():
                p = payload.get_payload()
                if isinstance(p,list):
                    html_text = p[0]
                else:
                    html_text = p

        try:
            # Remove any HTML tags, and any inline styles
            soup = BeautifulSoup(str(html_text))
            [s.extract() for s in soup('style')]
            text = soup.text.strip()
        except:
            pass
        # Just in case we get a plain text email

        else:
            text = b.get_payload()
        row = [
                    None,
                    sender[0],
                    sender[1],
                    recipient[0],
                    recipient[1],
                    msg['Subject'],
                    topic,
                    msg['Date'],
                    msg['Message-ID'],
                    text
                ]
    cur.execute("INSERT INTO emails VALUES(?,?,?,?,?,?,?,?,?,?);", row)



# A litle utility function that separates name and email from strings like '"Some Name" <[email protected]>'
def name_email(s):
    if not s:
        return None, None
    pieces = s.split('<')
    if len(pieces) > 1:
        name = pieces[0].replace('"','').strip()
        email = pieces[1].replace('>','')
    else:
        name = None
        email = pieces[0].replace('>','')
    return name,email

conn = sqlite3.connect(cwd + '/test.db')
cur = conn.cursor()

# Create the table. 
cur.execute("DROP TABLE IF EXISTS emails")    
cur.execute("CREATE TABLE emails(id INTEGER PRIMARY KEY, sender_name TEXT, sender_email TEXT, recipient_name TEXT, recipient_email TEXT, subject TEXT, conversation_topic TEXT, message_date TEXT, message_id TEXT, text_body TEXT)")
cur.execute("CREATE INDEX index_sender_name ON emails (sender_name)")

Mbox samples

0

There are 0 answers