Hi all,
I've been developing a Python tool to ingest and write all emails from a
PST exported from Outlook to individual .html files. The issue is that when
opening the PST in Outlook and checking the source information for emails
individually, it includes this specific line:
which IS NOT being included when importing the PST with Pywin32 and reading
all the emails in the PST. To see what it looks like in a chunk -
From Outlook I get: http://schemas.microsoft.com/office/2004/12/omml"; xmlns="
http://www.w3.org/TR/REC-html40";>
What is exported from the tool: http://schemas.microsoft.com/office/2004/12/omml"; xmlns="
http://www.w3.org/TR/REC-html40";>
The contents of the emails are otherwise ENTIRELY identical except for that
one tag.
My code:
-
def find_pst_folder(OutlookObj, pst_filepath):
    """Return the root folder of the data-file store backed by *pst_filepath*.

    Walks ``OutlookObj.Stores`` and returns ``Store.GetRootFolder()`` for the
    first store whose ``IsDataFileStore`` is truthy and whose ``FilePath``
    refers to the same file as *pst_filepath*.  Returns ``None`` when no
    store matches.

    Paths are compared after Windows-style normalisation (case folded,
    ``/`` treated as ``\\``, redundant separators collapsed), so
    ``c:/mail/A.pst`` matches ``C:\\Mail\\a.pst``.  Any pair that compared
    equal as raw strings still compares equal.
    """
    # ntpath rather than os.path: the paths being compared are Windows
    # paths, so the comparison rules should not depend on the host platform.
    import ntpath

    def _canonical(value):
        # Leave non-string values untouched so they simply fail to match.
        if isinstance(value, str):
            return ntpath.normcase(ntpath.normpath(value))
        return value

    wanted = _canonical(pst_filepath)
    for Store in OutlookObj.Stores:
        # FilePath is only read for data-file stores (short-circuit `and`).
        if Store.IsDataFileStore and _canonical(Store.FilePath) == wanted:
            return Store.GetRootFolder()
    return None
def enumerate_folders(FolderObj):
    """Recursively process *FolderObj* and every folder nested beneath it.

    Each entry of ``FolderObj.Folders`` is walked first (depth-first), after
    which the messages of *FolderObj* itself are handed to
    ``iterate_messages``.
    """
    for subfolder in FolderObj.Folders:
        enumerate_folders(subfolder)
    # Sub-folders are done; now handle this folder's own messages.
    iterate_messages(FolderObj)
def iterate_messages(FolderObj):
    """Export every item in ``FolderObj.Items`` and update the global tallies.

    For each item ``totalEmails`` is incremented, then the item's HTML body
    and subject are passed to ``writeToFile`` together with the module-level
    ``exportPath``.  The outcome decides which counter moves:

    * success          -> ``mycounter2`` and ``htmlEmails`` are incremented;
    * ``AttributeError`` (``writeToFile`` raises it for bodies it rejects as
      non-HTML) -> ``richPlainEmails`` is incremented;
    * any other ``Exception``, whatever its cause -> ``encryptedEmails`` is
      incremented.

    No exception escapes the loop; failures are only counted.
    """
    global mycounter2, encryptedEmails, richPlainEmails, totalEmails, htmlEmails

    for message in FolderObj.Items:
        totalEmails += 1
        try:
            html = message.HTMLbody
            subject = message.Subject
            writeToFile(html, exportPath, subject)
            # Only advance the file counter once the write has succeeded.
            mycounter2 += 1
            htmlEmails += 1
        except AttributeError:
            richPlainEmails += 1
        except Exception:
            encryptedEmails += 1
def writeToFile(messageHTML, path, mysubject):
    """Write one HTML email body to ``htmloutput<N>.html`` inside *path*.

    ``<N>`` is the current value of the module-level ``mycounter2`` (read
    only here; the caller advances it after a successful write).

    Parameters:
        messageHTML: the email's HTML body as a string.
        path: directory that receives the output file.
        mysubject: the email subject, printed only to help locate a message
            whose body could not be written.

    Raises:
        AttributeError: the body carries one of the "converted from
            plain text / RTF" markers, i.e. it is not a native HTML email.
            Nothing is written in that case.
        FileExistsError: the target file already exists (mode ``"x"``).

    Errors raised while writing are reported on stdout and swallowed.
    """
    import os

    # Only HTML emails are desired, so converted plain-text/RTF bodies are
    # rejected before any file is created.
    # NOTE(review): in the reviewed text both marker literals were empty
    # strings (they look like HTML comments stripped by the mail archive),
    # which made this check always true.  The values below are the comments
    # Outlook is understood to insert into converted bodies - confirm them
    # against the original script.
    converted_markers = (
        '<!-- Converted from text/plain format -->',
        '<!-- Converted from text/rtf format -->',
    )
    if any(marker in messageHTML for marker in converted_markers):
        raise AttributeError()

    filename = 'htmloutput' + str(mycounter2) + '.html'
    # The context manager closes the file on every exit path.
    with open(os.path.join(path, filename), "x", encoding='utf-8') as file:
        # Collapse CRLF to LF so text-mode newline translation cannot
        # produce doubled carriage returns.
        normalized = messageHTML.replace('\r\n', '\n')
        try:
            file.write(normalized)
        # Handle any potential unexpected Unicode error
        except Exception as e:
            print('Exception: ' , e)
            try:
                # Prints email subject to more easily find the offending email
                print('Subject: ', mysubject)
                print(mycounter2)
                # Second attempt at the same write, kept from the original.
                file.write(normalized)
            except Exception as e:
                print('Tried utf decode: ', e)
# Tallies updated by iterate_messages() while the PST is processed.
htmlEmails = encryptedEmails = totalEmails = richPlainEmails = 0
# Both counters start at 1; mycounter2 numbers the exported .html files.
filenameCount = mycounter2 = 1

# Swap forward slashes for backslashes in the PST path before it is handed
# to Outlook and compared against the stores' file paths.
selectedPST = str(selectedPST.replace('/', '\\'))
print('\nRunning:' , selectedPST)

# Attach the PST to the Outlook session, then look up its root folder.
outlook.AddStore(selectedPST)
PSTFolderObj = find_pst_folder(outlook, selectedPST)
-
Because the emails otherwise are identical, I can only assume this is being
done by the library. I'm wondering if there's a reason that meta tag is
excluded, or if it's a bug in PyWin32?
Thanks for any input,
-Nick
___
python-win32 mailing list
python-win32@python.org
https://mail.python.org/mailman/listinfo/python-win32