python code for vcard duplicate removal in vcf file works with vobject but only for "exact duplicates"

949 views Asked by At
#!/usr/bin/env python2.7 

import vobject

abinfile='/foo/bar/dir/infile.vcf' #ab stands for address book  

aboutfile='/foo/bar/dir/outfile.vcf'  

def eliminate_vcard_duplicates(abinfile, aboutfile):
    """Copy the vCards of one address book to another, dropping exact duplicates.

    Two cards count as duplicates only when their ``serialize()`` output is
    identical. The first occurrence of each card is kept and the input order
    of the surviving cards is preserved. An empty input file yields an empty
    output file.

    Args:
        abinfile: path of the address book (.vcf) file to read.
        aboutfile: path of the de-duplicated address book file to write.
    """
    # Serialize every card exactly once; the serialized text is both the
    # duplicate-detection key and what ends up in the output file.
    seen_serials = set()
    unique_serials = []
    with open(abinfile) as source_file:
        for vcard in vobject.readComponents(source_file):
            serial = vcard.serialize()
            # Set membership is O(1), so the whole pass is linear instead of
            # comparing each card against every card kept so far.
            if serial not in seen_serials:
                seen_serials.add(serial)
                unique_serials.append(serial)

    # Write the surviving cards to the address book OUT FILE.
    with open(aboutfile, 'w') as destination_file:
        destination_file.writelines(unique_serials)

eliminate_vcard_duplicates(abinfile, aboutfile)

The above code works and creates a new file where there are no exact duplicates (duplicates with identical serializations). I know the code has some efficiency issues: it's O(n²) when it could be O(n log n); we could serialize each vCard only once; the for loops are used inefficiently; etc. Here I wanted to provide a short piece of code to illustrate one of the issues I don't know how to solve.

The issue that I'm not sure how to solve elegantly is this one: If some of the fields in the cards are scrambled it will not detect they are equal. Is there a way to detect such duplicates either with vobject, re, or another approach?

The file contents used in the test, with four equal vCards (scrambled phone numbers break the code, though a scrambled email position does not), are these:

BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
EMAIL;TYPE=INTERNET:[email protected]
TEL;TYPE=CELL:123456789
TEL;TYPE=CELL:987654321
END:VCARD
BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
EMAIL;TYPE=INTERNET:[email protected]
TEL;TYPE=CELL:123456789
TEL;TYPE=CELL:987654321
END:VCARD
BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
TEL;TYPE=CELL:123456789
TEL;TYPE=CELL:987654321
EMAIL;TYPE=INTERNET:[email protected]
END:VCARD
BEGIN:VCARD
VERSION:3.0
FN:Foo_bar1
N:;Foo_bar1;;;
TEL;TYPE=CELL:987654321
TEL;TYPE=CELL:123456789
EMAIL;TYPE=INTERNET:[email protected]
END:VCARD

The above code will not detect that the four are all the same because the last one has the phone numbers scrambled.

As bonus points, if someone has a faster algorithm it would be great if it can be shared. The above one takes days on a 30.000 Vcard file...

3

There are 3 answers

0
Brian Barcelona On

The following code is faster (by about three orders of magnitude) but still only removes exact duplicates...

    #!/usr/bin/env python2.7 

    import vobject
    import datetime

    abinfile='/foo/bar/dir/infile.vcf' #ab stands for address book  

    aboutfile='/foo/bar/dir/outfile.vcf' 

    def eliminate_vcard_duplicatesv2(abinfile, aboutfile):
        """Write the vCards of ``abinfile`` to ``aboutfile`` without exact duplicates.

        Cards are duplicates only when their ``serialize()`` output is identical.
        The first occurrence of every card (including the very first card of the
        file) is kept, in input order. Progress is printed every 1000 cards and
        a summary is printed before the output file is written.

        Args:
            abinfile: path of the address book (.vcf) file to read.
            aboutfile: path of the de-duplicated address book file to write.
        """
        # First convert the address book IN FILE into a list of cards.
        with open(abinfile) as source_file:
            ablist = list(vobject.readComponents(source_file))

        # Serialize each card once; the strings are the comparison keys.
        ablist_serial = [vcard.serialize() for vcard in ablist]

        # Record the position of the first occurrence of each distinct card.
        # The scan starts at index 0 so the first card is never skipped, and a
        # set makes each "already seen?" check O(1).
        seen_serials = set()
        ablist_singletons = []
        duplicates = 0
        for i, serial in enumerate(ablist_serial):
            if i and i % 1000 == 0:
                # Single-argument print: same text on Python 2 and Python 3.
                print("COMPUTED CARD: %d Number of duplicates:  %d Current time: %s"
                      % (i, duplicates, datetime.datetime.now().time()))
            if serial in seen_serials:
                duplicates += 1
            else:
                seen_serials.add(serial)
                ablist_singletons.append(i)

        print("Length of Original Vcard File:  %d" % len(ablist))
        print("Length of Singleton Vcard File:  %d" % len(ablist_singletons))
        print("Generating Singleton Vcard file and storing it in:  %s" % (aboutfile,))

        # Finally write the de-duplicated cards to the address book OUT FILE.
        with open(aboutfile, 'w') as destination_file:
            destination_file.writelines(ablist_serial[k] for k in ablist_singletons)

    eliminate_vcard_duplicatesv2(abinfile, aboutfile)
0
Aly Niichi On

A variation on Anthon's answer, using class decorators.

import vobject
from vobject.base import Component, ContentLine


def sortedContents(cls):
    """Class decorator that installs a sorting ``getSortedChildren`` on *cls*.

    The installed method visits the keys returned by ``self.sortChildKeys()``
    in order and, for each key, appends the items of ``self.contents[key]``
    in sorted order. The decorated class itself is returned.
    """
    def getSortedChildren(self):
        children = []
        for key in self.sortChildKeys():
            # Sort within a key so repeated entries come out in a stable order.
            children.extend(sorted(self.contents[key]))
        return children

    setattr(cls, 'getSortedChildren', getSortedChildren)
    return cls


def sortableContent(cls):
    """Class decorator that makes instances of *cls* comparable by their text.

    Installs ``__lt__`` and ``__eq__`` on *cls*; both compare ``str(self)``
    with ``str(other)``. The decorated class itself is returned.
    """
    def __eq__(self, other):
        mine, theirs = str(self), str(other)
        return mine == theirs

    def __lt__(self, other):
        mine, theirs = str(self), str(other)
        return mine < theirs

    for method in (__lt__, __eq__):
        setattr(cls, method.__name__, method)
    return cls


# Patch vobject's classes in place so serialize() emits repeated entries
# (e.g. several TEL lines) in sorted order.
Component = sortedContents(Component)
ContentLine = sortableContent(ContentLine)


# A set keeps one copy of each distinct serialized card.
with open('infile.vcf') as infile:
    addresses = {vcard.serialize() for vcard in vobject.readComponents(infile)}

# The output file is opened in binary mode, so encode each card before writing.
with open('outfile.vcf', 'wb') as outfile:
    outfile.writelines(address.encode('UTF-8') for address in addresses)
0
Anthon On

One thing you might have noticed is that if you call the .serialize() method then EMAIL is sorted before FN. But unfortunately the telephone numbers are not sorted. If they were, you could add the serialized individual components to a set, and let the unique hashes sort out the multiple occurrences.

If you investigate what you get from the generator vobject.readComponents() (e.g. using type()), you'll see that that is a Component from the module vobject.base, and using dir() on an instance you see a method getSortedChildren(). If you look that up in the source, you'll find:

def getSortedChildren(self):
    # Quoted from vobject.base.Component: keys are visited in the order given
    # by sortChildKeys(), but the items stored under each key are emitted in
    # the order they appear in self.contents[k] -- they are not sorted.
    return [obj for k in self.sortChildKeys() for obj in self.contents[k]]

and sortChildKeys() directly above that:

def sortChildKeys(self):
    # Quoted from vobject.base.Component: returns the keys of self.contents
    # in the order getSortedChildren() walks them.
    try:
        # Keys listed in the behavior's sortFirst go to the front, but only
        # those actually present in this component's contents.
        first = [s for s in self.behavior.sortFirst if s in self.contents]
    except Exception:
        # Any failure while reading behavior.sortFirst means no key gets
        # priority (presumably the case when no behavior is set -- TODO confirm).
        first = []
    # All remaining keys follow in sorted order.
    return first + sorted(k for k in self.contents.keys() if k not in first)

calling sortChildKeys() on your example instances gives ['version', 'email', 'fn', 'n', 'tel'], which leads to two conclusions:

  • sortFirst causes version to be at the front
  • for obj in self.contents[k] is not sorted therefore your TEL entries are not sorted.

The solution seems to be that you redefine getSortedChildren() to:

    return [obj for k in self.sortChildKeys() for obj in sorted(self.contents[k])]

but that leads to:

TypeError: '<' not supported between instances of 'ContentLine' and 'ContentLine'

so you need to provide some basic comparison operations for ContentLine, which is also defined in vobject.base:

import vobject

from vobject.base import Component, ContentLine

def gsc(self):
    """Return the children grouped by key, with each key's items sorted.

    Keys are visited in the order given by ``self.sortChildKeys()``; the
    items of ``self.contents[key]`` are sorted before being appended.
    """
    ordered = []
    for key in self.sortChildKeys():
        ordered.extend(sorted(self.contents[key]))
    return ordered

Component.getSortedChildren = gsc

def ltContentLine(self, other):
    """Return True when the text form of *self* orders before that of *other*."""
    own_text = str(self)
    other_text = str(other)
    return own_text < other_text

def eqContentLine(self, other):
    """Return True when *self* and *other* have the same text form."""
    own_text = str(self)
    other_text = str(other)
    return own_text == other_text

ContentLine.__lt__ = ltContentLine
ContentLine.__eq__ = eqContentLine


# Collect the serialized cards in a set so identical serializations collapse.
# Handy while exploring what vobject hands back for each card:
#   print(type(vcard))
#   print(dir(vcard))
#   print(vcard.sortChildKeys())
#   print(vcard.contents.keys())
with open('infile.vcf') as infile:
    addresses = {vcard.serialize() for vcard in vobject.readComponents(infile)}

with open('outfile.vcf', 'w') as outfile:
    outfile.writelines(addresses)

# and check
with open('outfile.vcf') as written:
    print(written.read(), end="")

which gives:

BEGIN:VCARD
VERSION:3.0
EMAIL;TYPE=INTERNET:[email protected]
FN:Foo_bar1
N:;Foo_bar1;;;
TEL;TYPE=CELL:123456789
TEL;TYPE=CELL:987654321
END:VCARD