This project reads in three patent portfolios, deletes any expired patents, and then evaluates the remaining patents based on age, number of citations, and number of claims. If two patents have the same number of citations and the same number of claims, the patent with the longer remaining term is ranked as more valuable. For this project, I selected AMD's patent portfolio, along with subsets of Google's and Yahoo's portfolios, to give three different examples.
Using a local MySQL database was taking too long to retrieve the data, so I used the PatentsView API (http://www.patentsview.org/api/doc.html) to generate my datasets. I took advantage of the API wrapper from https://github.com/CSSIP-AIR/PatentsView-APIWrapper to run the queries and convert the data from JSON to CSV files for the input of my analysis.
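For reference, the rough shape of such a query can be sketched as below. The field names follow the PatentsView documentation linked above, but this is a hypothetical sketch: `build_portfolio_query` is a helper name of my own, and the assignee string is a placeholder that should be checked against the actual data.

```python
import json

# Hypothetical sketch of the query parameters for a PatentsView patents query.
# The 'q' (query) and 'f' (fields) parameter names follow the API documentation.
def build_portfolio_query(assignee, fields):
    """Build the q/f parameters to fetch a portfolio by assignee name."""
    query = {"_eq": {"assignee_organization": assignee}}
    return {"q": json.dumps(query), "f": json.dumps(fields)}

params = build_portfolio_query(
    "Advanced Micro Devices, Inc.",  # placeholder assignee string
    ["patent_number", "patent_date", "patent_num_claims",
     "patent_num_combined_citations"],
)
```

The wrapper linked above handles building and sending such queries, plus the JSON-to-CSV conversion.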
import pandas as pd
from pandas import ExcelWriter
import numpy as np
from datetime import datetime
# Weighting factors for the final score. I experimented with different values and ultimately settled on these.
citation_score_factor = 2.0
claim_score_factor = 0.25
date_score_factor = 0.5
# Data files to work on
AMD_data_file = 'Patent_Portfolios - AMD.csv'
Google_data_file = 'Patent_Portfolios - Google.csv'
Yahoo_data_file = 'Patent_Portfolios - Yahoo.csv'
# Custom functions
# Boolean predicate to determine if a patent has already expired
def isExpired(patent_date):
    # Add 20 years to the grant date and then compare it with today
    expiry = datetime.strptime(patent_date, '%Y-%m-%d')
    expiry = expiry.replace(year=expiry.year + 20)
    return expiry < datetime.now()
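As a quick sanity check of the 20-years-from-grant assumption, here is a self-contained version of the same test (`is_expired` is a standalone copy, not the project function):

```python
from datetime import datetime

# Mirrors isExpired above: a patent is treated as expired 20 years after its grant date.
def is_expired(patent_date):
    granted = datetime.strptime(patent_date, '%Y-%m-%d')
    return granted.replace(year=granted.year + 20) < datetime.now()

print(is_expired('1995-06-13'))  # True: a 1995 grant is past its 20-year term
```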
# Function to flag expired patents so they can be excluded from the analysis
def markExpired(patents):
    # Iterate through the dataframe and add a boolean column indicating whether a patent has expired
    for i, row in patents.iterrows():
        patents.at[i, 'expired'] = isExpired(row['patent_date'])
    return patents

# Function to get only the non-expired patents for the analysis
def getNonExpired(patents):
    # Return a dataframe containing only unexpired patents
    return patents[patents['expired'] == False]
# Function to score the patent claims based on the quantile they fall in
def scoreClaims(patents):
    # Score the patents based on which quantile their number of claims falls into. This value is
    # saved as the patent's claim score and becomes part of the overall score later.
    sorted_patents = patents.sort_values(by='patent_num_claims', ascending=False, kind='mergesort')
    levels = [round(q, 2) for q in np.arange(0.05, 1.01, 0.05)]
    quantiles = sorted_patents['patent_num_claims'].quantile(levels)
    max_claims = sorted_patents['patent_num_claims'].max()
    for i, row in sorted_patents.iterrows():
        if row['patent_num_claims'] == max_claims:
            # Give a higher value to the maximum of the set
            sorted_patents.at[i, 'claim_score'] = 1.10
            continue
        for q in levels:
            if row['patent_num_claims'] <= quantiles[q]:
                sorted_patents.at[i, 'claim_score'] = q
                break
    return sorted_patents
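The row-by-row quantile binning used in scoreClaims (and scoreCitations below) can also be expressed in a vectorized way. This is a sketch under the same 5%-bin assumption; `quantile_score` is a helper name of my own, not part of the project code:

```python
import numpy as np
import pandas as pd

def quantile_score(series, step=0.05):
    """Map each value to the smallest quantile level whose cutoff covers it."""
    levels = np.round(np.arange(step, 1.0 + step / 2, step), 2)
    cutoffs = series.quantile(levels).to_numpy()
    # searchsorted finds the first cutoff >= each value, i.e. its quantile bin
    idx = np.searchsorted(cutoffs, series.to_numpy(), side='left')
    scores = levels[np.minimum(idx, len(levels) - 1)]
    # give the maximum of the set a bonus, as in scoreClaims
    scores = np.where(series.to_numpy() == series.max(), 1.10, scores)
    return pd.Series(scores, index=series.index)
```

This avoids the per-row loop entirely, which matters once portfolios grow to thousands of patents.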
# Function to score the patent citations based on the quantile they fall in
def scoreCitations(patents):
    # Score the patents based on which quantile their number of citations falls into. This value is
    # saved as the patent's citation score and becomes part of the overall score later.
    sorted_patents = patents.sort_values(by='patent_num_combined_citations', ascending=False, kind='mergesort')
    levels = [round(q, 2) for q in np.arange(0.05, 1.01, 0.05)]
    quantiles = sorted_patents['patent_num_combined_citations'].quantile(levels)
    max_citations = sorted_patents['patent_num_combined_citations'].max()
    for i, row in sorted_patents.iterrows():
        if row['patent_num_combined_citations'] == max_citations:
            # Give a higher value to the maximum of the set
            sorted_patents.at[i, 'citation_score'] = 1.10
            continue
        for q in levels:
            if row['patent_num_combined_citations'] <= quantiles[q]:
                sorted_patents.at[i, 'citation_score'] = q
                break
    return sorted_patents
# Function to score the patents based on their dates. Slightly different from the previous two functions because I couldn't get the quantiles to work with dates.
def scoreDates(patents):
    # Score the patents based on how much time remains in their term. This value is saved as the
    # patent's date score and becomes part of the overall score later.
    sorted_patents = patents.sort_values(by='patent_date', ascending=False, kind='mergesort')
    # I couldn't get quantiles to work with dates, so I'm scoring dates based on how much time is
    # left in the term. Anything less than two years is nearly worthless for enforcement.
    # A patent granted more than 18 years ago has less than 2 years left of its 20-year term, and so on.
    two_years = datetime.now()
    two_years = two_years.replace(year=two_years.year - 18)
    five_years = datetime.now()
    five_years = five_years.replace(year=five_years.year - 15)
    ten_years = datetime.now()
    ten_years = ten_years.replace(year=ten_years.year - 10)
    fifteen_years = datetime.now()
    fifteen_years = fifteen_years.replace(year=fifteen_years.year - 5)
    for i, row in sorted_patents.iterrows():
        grant_date = datetime.strptime(row['patent_date'], '%Y-%m-%d')
        if grant_date <= two_years:
            sorted_patents.at[i, 'date_score'] = .2
        elif grant_date <= five_years:
            sorted_patents.at[i, 'date_score'] = .4
        elif grant_date <= ten_years:
            sorted_patents.at[i, 'date_score'] = .6
        elif grant_date <= fifteen_years:
            sorted_patents.at[i, 'date_score'] = .8
        else:
            sorted_patents.at[i, 'date_score'] = 1
    return sorted_patents
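The cutoffs above work because a patent granted N years ago has roughly 20 - N years left under the 20-year assumption. A small helper (my own, not part of the project code) makes that arithmetic explicit:

```python
from datetime import datetime

# Hypothetical helper: approximate years remaining in a patent's term,
# assuming a 20-year term measured from the grant date (ignores leap-day edge cases).
def remaining_term_years(patent_date, term_years=20):
    granted = datetime.strptime(patent_date, '%Y-%m-%d')
    expiry = granted.replace(year=granted.year + term_years)
    return (expiry - datetime.now()).days / 365.25
```

So the `two_years` cutoff (now minus 18 years) corresponds to patents where this value drops below 2.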
# Function that uses all the other utility functions to score the patents
def scorePatents(patents):
    # Mark the expired patents
    patents = markExpired(patents)
    # Get the non-expired patents
    scored_patents = getNonExpired(patents)
    # Score the claims
    scored_patents = scoreClaims(scored_patents)
    # Score the citations
    scored_patents = scoreCitations(scored_patents)
    # Score the dates
    scored_patents = scoreDates(scored_patents)
    # Incorporate the score factors set earlier and add up the results
    for i, row in scored_patents.iterrows():
        total_score = (row['citation_score'] * citation_score_factor
                       + row['claim_score'] * claim_score_factor
                       + row['date_score'] * date_score_factor)
        scored_patents.at[i, 'patent_score'] = total_score
    sorted_data = scored_patents.sort_values(by='patent_score', ascending=False, kind='mergesort')
    return sorted_data
# Utility function to write the results to an Excel spreadsheet
def writeToExcel(AMD_df, Google_df, Yahoo_df):
    with ExcelWriter('patent_results.xlsx') as writer:
        AMD_df.to_excel(writer, sheet_name='AMD')
        Google_df.to_excel(writer, sheet_name='Google')
        Yahoo_df.to_excel(writer, sheet_name='Yahoo')
We are going to load in the data for each portfolio, and then display the head of each file. The input files are .csv files.
# Load and peek at the AMD data.
unsorted_AMD_data = pd.read_csv(AMD_data_file)
unsorted_AMD_data.head()
# Load and peek at the Google data.
unsorted_Google_data = pd.read_csv(Google_data_file)
unsorted_Google_data.head()
# Load and peek at the Yahoo data.
unsorted_Yahoo_data = pd.read_csv(Yahoo_data_file)
unsorted_Yahoo_data.head()
We are going to score each portfolio and write the results to an Excel spreadsheet. Each portfolio will be written to a different sheet within the workbook.
We are using the following three scoring criteria:
- patent_num_combined_citations: a measure of how many other patent applications have referenced the patent
- patent_num_claims: a measure of the total number of claims in the patent
- patent_date: a measure of how much time remains in the patent term
As discussed during my presentation, patent_num_combined_citations is traditionally the most accurate factor in determining a patent's value, which is why I weighted it far more heavily than the other two criteria. I included the other criteria because (1) if I only used patent_num_combined_citations, this would simply be an exercise in sorting on that field, and (2) the other criteria are important in determining a patent's worth. For example, all other things being equal, a patent with ten years remaining is far more valuable than a patent with two years remaining.
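To make the weighting concrete, here is a small worked example using the factors set at the top of the notebook (2.0, 0.25, 0.5) with hypothetical component scores:

```python
citation_score_factor = 2.0
claim_score_factor = 0.25
date_score_factor = 0.5

# Hypothetical patent: 95th-percentile citations, median claims, ~10 years remaining
citation_score, claim_score, date_score = 0.95, 0.50, 0.60
total_score = (citation_score * citation_score_factor
               + claim_score * claim_score_factor
               + date_score * date_score_factor)
# citations contribute 1.90 of the ~2.33 total, dominating the ranking
```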
Initially, I wanted to evaluate the length of the first independent claim, because a shorter claim is usually an indication that the claim is easier to infringe. This is where my issues with MySQL came into play: it was taking more than two minutes per query to retrieve the first independent claim of each patent. With the size of the portfolios, this wasn't practical, so I will leave that for future work. I did start to experiment with a GPU-powered database called 'mapd,' which promised much faster response times, but I wasn't very familiar with it and risked spending far too much time configuring it instead of working on this project.
The 'scorePatents' function goes through a portfolio and scores the three criteria I discussed above. I run this on each of the dataframes I have already created.
unsorted_AMD_data = scorePatents(unsorted_AMD_data)
unsorted_AMD_data.head()
unsorted_Google_data = scorePatents(unsorted_Google_data)
unsorted_Google_data.head()
unsorted_Yahoo_data = scorePatents(unsorted_Yahoo_data)
unsorted_Yahoo_data.head()
Write the final results out to the Excel file. Each portfolio will be written to its own worksheet. The scored patent portfolios will be in a file called 'patent_results.xlsx.'
Initially, I wanted to use Google Sheets, but after talking with the partner I was working with, we decided to use Excel because everyone in our group is more familiar with it.
writeToExcel(unsorted_AMD_data, unsorted_Google_data, unsorted_Yahoo_data)
I asked several people in my group for feedback on this solution. One suggestion that I received and incorporated into this project is to use the date criterion as one of the scoring factors. I think this can be refined a bit more, but our particular group finds itself on the plaintiff/complainant side of things, and having at least two years remaining on a patent's term keeps both district court and International Trade Commission actions in play.
Additionally, users requested some visualization plots, but that is beyond the initial scope of this project and I have added them to the Future Work section.
I have also presented the final project to my partner, and he has expressed that it meets his expectations. The email correspondence can be found in this p4 directory as 'McNamra_Email.pdf.'
In the future, I would like to incorporate the GPU database I discussed earlier. Additionally, I'd like to evaluate the patents based on the length of the first independent claim. Finally, I'd like to add some visualizations to help convey how the ranking works.