top of page

JavaScript to Extract SSNs


Tonight, I tested a JavaScript script posted here by EverMap that extracts Social Security numbers from PDFs. My test set has SSNs listed on the first page of a short document, in a table in another short document, and hidden in a very long document. The script found each SSN.

Enter the script below in Adobe Acrobat by going to Tools . . . Action Wizard and clicking 'Create New Action . . . '. Under 'More Tools', in the 'Choose tools to add' section, click on 'Execute Javascript', then uncheck 'Prompt User' and click on 'Specify Settings' on the right.

Enter the script in the JavaScript Editor . . .

Click OK and save and rename the new action.

Add the files you want to process and then click Start. A new PDF will be generated for each source PDF, listing each SSN found in it.

/* Extract US Social Security Numbers From the Document */
// This script will scan all pages of the input document
// and extract:
//   Social security numbers
// Output PDF document will be placed in the same folder
// as input. The name of the output document will be:
//   Original filename + "_Extracted_SSNs"
// Visit www.evermap.com for more useful JavaScript samples.

// Pattern for an SSN written as AAA-GG-SSSS or AAA GG SSSS:
//   - area (AAA):    001-899, excluding 666. Areas above 772 have been
//                    issued since SSA randomized assignment in June 2011,
//                    so they must be accepted; 000, 666 and 9xx are not
//                    issued as SSN areas.
//   - group (GG):    01-99
//   - serial (SSSS): 0001-9999
// The separator (space or hyphen) is captured once and reused via \1 so
// both separators in a number must be the same character.
// The "g" flag is required so that every occurrence on a page is found.
var reMatch = /(?!000|666)[0-8]\d{2}([ -])(?!00)\d\d\1(?!0000)\d{4}/g;

// Suffix that replaces ".pdf" in the source path to name the report file.
var strExt = "_Extracted_SSNs.pdf";
// Report heading; the source document path is appended to it.
var strIntro = "Social Security Numbers extracted from document: ";
// Report footer; the number of unique matches is appended to it.
var strFinal = "Total number of SSNs extracted: ";

// Run the extraction against the document the action is processing.
ExtractFromDocument(reMatch, strExt, strIntro, strFinal);

/**
 * Scans every page of the current document for text matching a pattern and
 * saves the unique matches, in first-seen order, to a new PDF report.
 *
 * Must run with `this` referring to the Acrobat document being processed;
 * it reads this.path, this.numPages, this.getPageNumWords and
 * this.getPageNthWord.
 *
 * @param {RegExp} reMatch     Pattern to search for in each page's text.
 *                             A non-global pattern is treated as global so
 *                             that all occurrences are collected.
 * @param {string} strFileExt  Suffix that replaces the trailing ".pdf" of
 *                             the source path to form the report path.
 * @param {string} strMessage1 Report heading; the source path is appended.
 * @param {string} strMessage2 Report footer; the match count is appended.
 *
 * Errors raised while building or saving the report are shown through
 * app.alert rather than propagated.
 */
function ExtractFromDocument(reMatch, strFileExt, strMessage1, strMessage2) {
    // Construct the filename for the output document. The extension is
    // matched case-insensitively, and the suffix is appended when there is
    // no ".pdf" extension, so the report path never equals the source path
    // (e.g. for "FILE.PDF").
    var sourcePath = this.path;
    var rePdfExt = /\.pdf$/i;
    var filename = rePdfExt.test(sourcePath)
        ? sourcePath.replace(rePdfExt, strFileExt)
        : sourcePath + strFileExt;

    // String.match returns every occurrence only for a global pattern; for
    // a non-global one it returns the first match plus its capture groups,
    // which must not be recorded as matches.
    var rePattern = reMatch;
    if (!reMatch.global) {
        rePattern = new RegExp(
            reMatch.source,
            "g" + (reMatch.ignoreCase ? "i" : "") + (reMatch.multiline ? "m" : "")
        );
    }

    // Create the report document.
    try {
        var ReportDoc = new Report();
        var seen = {};       // set of matches already recorded
        var uniqueMatches = []; // matches in the order first encountered

        ReportDoc.writeText(strMessage1 + sourcePath);
        ReportDoc.divide(1);      // draw a horizontal divider
        ReportDoc.writeText(" "); // write a blank line to output

        for (var i = 0; i < this.numPages; i++) {
            // Rebuild the page text from its words. bStrip=false asks
            // Acrobat not to strip each word; presumably this keeps the
            // punctuation/whitespace that separates the parts of a number
            // (see the Acrobat JavaScript API reference).
            var numWords = this.getPageNumWords(i);
            var pageWords = [];
            for (var j = 0; j < numWords; j++) {
                pageWords.push(this.getPageNthWord(i, j, false));
            }

            var strMatches = pageWords.join("").match(rePattern);
            if (strMatches == null) continue;

            // Record each match once. Keys are prefixed so that a match can
            // never collide with a built-in object property name.
            for (var k = 0; k < strMatches.length; k++) {
                var key = "$" + strMatches[k];
                if (seen[key] === true) continue;
                seen[key] = true;
                uniqueMatches.push(strMatches[k]);
            }
        }

        // Output the unique matches into the report document.
        for (var n = 0; n < uniqueMatches.length; n++) {
            ReportDoc.writeText(uniqueMatches[n]);
        }
        ReportDoc.writeText(" "); // output extra blank line
        ReportDoc.divide(1);      // draw a horizontal divider
        ReportDoc.writeText(strMessage2 + uniqueMatches.length);

        // Save the report to a document.
        ReportDoc.save({ cDIPath: filename });
    } catch (e) {
        app.alert("Processing error: " + e);
    }
} // end of the function


Sean O'Shea has more than 20 years of experience in the litigation support field with major law firms in New York and San Francisco.   He is an ACEDS Certified eDiscovery Specialist and a Relativity Certified Administrator.

​

The views expressed in this blog are those of the owner and do not reflect the views or opinions of the owner’s employer.

​

If you have a question or comment about this blog, please make a submission using the form to the right. 

Your details were sent successfully!

© 2015 by Sean O'Shea . Proudly created with Wix.com

bottom of page