This class was created to simplify the process of converting a Microsoft Docx file to HTML while preserving its CSS styling. The class still has some bugs and is under development, so new features will be coming soon.

Currently this class supports the following HTML components:

 
                                                            Table with colspan and rowspans
                                                            Headings(h1, h2, h3, h4, h5, h6)
                                                            Colors(background, and foreground)
                                                            Images 
                                                            Superscript 
                                                            Subscript
                                                            Lists(numbered, disc)
                                                            Links
                                                            With Some Basic CSS Properties



I used jQuery to remove list items with an empty value. I am working on other features and will update this class as each feature starts working.

Class Construction

A Microsoft Docx file is a compressed zip archive composed of binary data (images, videos) and XML data. To decode that data we need to understand how it is structured and how its values are rendered to a view.

This class is built to be used through its public functions. Let's look at the underlying code of this class.

/**
 * Docx-to-HTML converter; this excerpt shows only how the source file is registered.
 */
class Docx2Html
{
    /** @var mixed Path of the .docx file, kept exactly as given to the constructor. */
    private $docxFileName;

    /**
     * Remember which file later method calls should work on.
     *
     * @param mixed $filePath Path of the .docx file.
     */
    public function __construct($filePath)
    {
        $this->docxFileName = $filePath;
    }
}

As you can see, the code above is the class constructor, which runs when we instantiate the class. It takes the file path and assigns it to the private property $docxFileName.


    /**
     * Read the parts of the .docx archive needed for conversion and convert them.
     *
     * Collects word/document.xml, word/_rels/document.xml.rels and
     * word/numbering.xml as strings, extracts every entry under word/media/
     * into the "medias" directory, then passes the three XML strings to
     * findTags().
     *
     * If the archive cannot be opened, or a part is missing or unreadable,
     * the corresponding string stays empty and findTags() is still called.
     *
     * @return mixed Whatever findTags() returns for the collected XML.
     */
    public function read_docx(){
        // Archive entry name => XML text collected from that entry.
        $parts = array(
            "word/document.xml" => "",
            "word/_rels/document.xml.rels" => "",
            "word/numbering.xml" => "",
        );

        $zip = new ZipArchive();
        // ZipArchive::open() returns true on success and a non-zero (truthy)
        // error code on failure, so the result has to be compared strictly.
        if($zip->open($this->docxFileName) === true) {
            for($i = 0; $i < $zip->numFiles; $i++) {
                $name = $zip->getNameIndex($i);
                if($name === false) {
                    continue;
                }
                if(isset($parts[$name])) {
                    // Read the entry in one call instead of streaming it two
                    // bytes at a time through a handle that was never closed.
                    $data = $zip->getFromIndex($i);
                    if($data !== false) {
                        $parts[$name] .= $data;
                    }
                } elseif(substr($name,0,11) == "word/media/") {
                    $zip->extractTo("medias",$name);
                }
            }
            $zip->close();
        }

        return $this->findTags(
            $parts["word/document.xml"],
            $parts["word/_rels/document.xml.rels"],
            $parts["word/numbering.xml"]
        );
    }

The code above opens the file passed to the constructor as a zip archive and then extracts these 4 parts:
1. word/document.xml - the main component
2. word/_rels/document.xml.rels - some properties are also stored here
3. word/numbering.xml - the part used for list types
4. word/media - holds all the media files such as videos, images and audio


This function is the one that will be called from outside the class:

	/**
	 * Convert the configured .docx file to HTML.
	 *
	 * @return mixed The result of read_docx() for an existing .docx file,
	 *               otherwise the message "File Not exists" or
	 *               "We Only accept docx files at this time".
	 */
	public function convertToHtml() {
		// An unset path is reported like a missing file instead of slipping
		// past the existence check and failing later in pathinfo().
		if(!isset($this->docxFileName) || !file_exists($this->docxFileName)) {
			return "File Not exists";
		}
		// PATHINFO_EXTENSION yields "" for names without an extension, which
		// avoids the undefined 'extension' index; lower-casing lets ".DOCX"
		// files through as well.
		$file_ext = strtolower(pathinfo($this->docxFileName, PATHINFO_EXTENSION));
		if($file_ext !== "docx") {
			return "We Only accept docx files at this time";
		}
		return $this->read_docx();
	}

The code below lists the main tags of a Microsoft Docx file:


		// Names of the XML tags whose opening and closing forms are looked for.
		// Order and repeated entries ("w:sz" is listed twice) are kept as-is.
		$main_tags = [
			"w:b", "w:body", "w:p", "w:r", "w:t", "w:pPr",
			"w:rPr", "w:sectPr", "w:bCs", "w:i", "w:iCs",
			"w:pStyle", "w:u", "w:rFonts", "w:type",
			"w:docGrid", "w:pgSz", "w:pgMar", "w:pgNumType",
			"w:formProt", "w:textDirection", "w:sz", "w:szCs",
			"w:tab", "w:caps", "w:smallCaps",
			"w:spacing", "w:color", "w:highlight", "w:ind",
			"w:hanging", "w:pageBreakBefore", "w:jc",
			"w:firstLine", "w:wordWrap", "w:pgBorders", "w:dstrike",
			"w:sz", "w:strike", "w:vertAlign", "w:shd",
			"w:pBdr", "w:hyperlink", "w:rStyle", "w:drawing",
			"wp:anchor", "wp:simplePos", "wp:positionH",
			"wp:posOffset", "wp:extent", "wp:effectExtent",
			"wp:wrapSquare", "docPr", "wp:positionV",
			"wp:cNvGraphicFramePr", "a:graphic", "a:graphicData",
			"a:graphicFrameLocks", "pic:pic",
			"pic:nvPicPr", "pic:cNvPr", "a:picLocks", "a:blip",
			"pic:blipFill", "a:stretch", "a:fillRect",
			"a:xfrm", "a:off", "a:ext", "a:prstGeom", "pic:spPr",
			"a:avLst", "w:numId", "w:numPr", "wp:align",
			"w:tbl", "w:tblPr", "w:tblW", "w:tblInd", "w:tblBorders",
			"w:top", "w:left", "w:bottom", "w:insideH",
			"w:right", "tblCellMar", "w:tblGrid", "w:gridCol", "w:tr",
			"w:trPr", "w:tc", "w:tcPr", "w:tcW",
			"w:insideV", "w:vMerge", "w:gridSpan", "w:trHeight",
		];


The code below is an array of the attributes that are used in the above tags:

		// Attribute names that may appear on the tags above.
		// Order and repeated entries ("w:left" is listed twice) are kept as-is.
		$tags_prop = [
			"t", "b", "r", "l", "w:val", "w:ascii",
			"xml:space", "w:w", "w:h", "w:left", "w:right", "w:header",
			"w:top", "w:shd", "w:footer", "w:bottom", "w:left",
			"w:start", "w:end", "w:sz", "w:color", "w:space",
			"w:on", "w:after", "w:before", "w:hanging",
			"w:firstLine", "cy", "cx",
		];


The code below is an array of the tags that are used only for styling the components:

		// Tags that only carry styling information.
		$sty_tags = [
			"w:rFonts", "w:b", "w:bCs", "w:i", "w:iCs",
			"w:color", "w:u", "w:highlight", "w:pgSz", "w:pgMar",
			"w:pgBorders", "w:dstrike", "w:sz", "w:strike",
			"w:smallCaps", "w:caps", "w:vertAlign", "w:shd",
			"w:pBdr", "w:spacing", "w:ind", "w:hanging",
			"w:pageBreakBefore", "w:jc", "w:firstLine", "w:wordWrap",
			"w:pgNumType", "w:formProt", "w:textDirection", "w:docGrid",
			"w:type", "w:pStyle", "w:tab", "w:rStyle",
			"wp:simplePos", "wp:effectExtent", "wp:wrapSquare",
			"wp:extent", "a:picLocks", "a:fillReact",
			"a:off", "a:ext", "a:avLst", "a:fillRect",
			"a:graphicFrameLocks", "w:numId", "w:tblW", "w:tblInd",
			"w:top", "w:left", "w:bottom", "w:insideH",
			"w:right", "w:gridCol", "w:tcW", "w:insideV",
			"w:vMerge", "w:gridSpan", "w:trHeight",
		];



The code below defines, for each of the above tags, the default CSS to use when the tag is found, and the CSS property that replaces each of the tag's attributes.

		// Tag name => array(default CSS, array(attribute name => CSS text
		// placed in front of that attribute's value)).
		// Entries whose default is " " and whose attribute map is
		// array("" => "") carry no CSS of their own.
		$tags_rep = array(
			"w:rFonts" => array("font-family: sans-serif;",
							array("w:ascii" => "font-family: ","w:cs" => "font-family: ")),
			"w:b" => array("font-weight: bold;",array("w:val" => "font-weight: ")),
			"w:bCs" => array("font-weight: bold;",array("w:val" => "font-weight: ")),
			"w:i" => array("font-style: italic;", array("w:val" => "font-style: ")),
			"w:iCs" => array("font-style: italic;",array("w:val" => "font-style: ")),
			"w:color" => array("color: black;", array("w:val" => "color: ")),
			"w:u" => array("text-decoration: none;", array("w:val" => "text-decoration-style: ")),
			// Default now ends with ";" like every other non-blank default.
			"w:highlight" => array("background-color: none;", array("w:val" => "background-color: ")),
			// Was "max:width: ", which is not a CSS property name.
			"w:pgSz" => array("max-width: none;",array("w:w" => "max-width: ")),
			"w:pgMar" => array("margin: 0px;",
							array("w:top" => "margin-top: ", "w:bottom" => "margin-bottom: ",
								"w:left" => "margin-left: ", "w:right" => "margin-right: "
							)),
			"w:pgBorders" => array("border: 0px;",
								array("w:top" => "border-top: ", "w:bottom" => "border-bottom: ",
									"w:left" => "border-left: ", "w:right" => "border-right: ",
									"w:color" => "border-color: ", "w:sz" => "border-width: ",
									"w:val" => "border-style: ", "w:space" => "border-spacing: "
								)),
			"w:dstrike" => array("text-decoration-style: none;", array("w:val" => "text-decoration-style: ")),
			"w:sz" => array("font-size: auto;", array("w:val" => "font-size: ")),
			"w:szCs" => array("font-size: auto;", array("w:val" => "font-size: ")),
			"w:strike" => array("text-decoration: line-through;", array("w:on" => "text-decoration: ")),
			"w:smallCaps" => array("text-transform: uppercase; font-size: small;",
								 array("w:val" => "font-size: small; text-transform: ")),
			"w:caps" => array("text-transform: uppercase;", array("w:val" => "text-transform: ")),
			"w:vertAlign" => array("vertical-align: sub;", array("w:val" => "vertical-align: ")),
			"w:shd" => array("background-color: white;", array("w:shd" => "background-color: ")),
			"w:pBdr" => array("border: 0px;",
							array("w:top" => "border-top: ", "w:bottom" => "border-bottom: ",
								"w:left" => "border-left: ", "w:right" => "border-right: ",
								"w:color" => "border-color: ", "w:sz" => "border-width: ",
								"w:val" => "border-style: ", "w:space" => "border-spacing: "
							)),
			"w:spacing" => array("margin: 0px;",
								array("w:left" => "margin-left: ", "w:start" => "margin-left: ",
									"w:right" => "margin-right: ", "w:end" => "margin-right: ",
									"w:after" => "margin-bottom: ", "w:before" => "margin-top: ")),
			"w:ind" => array("margin: 0px;",
							array("w:left" => "margin-left: ", "w:start" => "margin-left: ",
								"w:right" => "margin-right: ", "w:end" => "margin-right: ",
								"w:after" => "margin-bottom: ", "w:before" => "margin-top: ")),
			"w:hanging" => array("padding-left: 0px;", array("w:hanging" => "padding-left: ")),
			// NOTE(review): "page-break-always" is not a CSS property; left
			// unchanged because the code consuming this value is not visible
			// here — confirm whether "page-break-before: " was intended.
			"w:pageBreakBefore" => array("page-break-before: avoid;", array("w:val" => "page-break-always: ")),
			"w:jc" => array("text-align: left;", array("w:val" => "text-align: ")),
			"w:firstLine" => array("text-indent: unset;", array("w:firstLine" => "text-indent: ")),
			"w:wordWrap" => array("vertical-align: sub;", array("w:val" => "vertical-align: ")),
			"w:pgNumType" => array(" ", array("" => "")),
			"w:formProt" => array(" ", array("" => "")),
			"w:textDirection" => array(" ", array("" => "")),
			"w:docGrid" => array(" ", array("" => "")),
			"w:type" => array(" ", array("" => "")),
			"w:pStyle" => array(" ", array("w:val" => "Heading")),
			"w:tab" => array(" ", array("" => "")),
			"w:rStyle" => array(" ", array("" => "")),
			"w:drawing" => array(" ", array("" => "")),
			"wp:anchor" => array(" ", array("" => "")),
			"wp:simplePos" => array(" ", array("" => "")),
			"wp:positionH" => array(" ", array("" => "")),
			"wp:positionV" => array(" ", array("" => "")),
			"wp:posOffset" => array(" ", array("" => "")),
			"a:picLocks" => array(" ", array("" => "")),
			"a:fillRect" => array(" ", array("" => "")),
			"a:off" => array(" ", array("" => "")),
			"a:ext" => array(" ", array("" => "")),
			"a:avLst" => array(" ", array("" => "")),
			"a:fillReact" => array(" ", array("" => "")),
			"a:graphicFrameLocks" => array(" ", array("" => "")),
			"w:numId" => array("list-style-type: none;", array("w:val" => "list-style-type: ")),
			"wp:extent" => array("height: auto; \n\t\twidth: auto;",
							 array("cy" => "height: ", "cx" => "width: ")),
			"wp:effectExtent" => array("margin: auto;",
									array("l" => "margin-left: ", "r" => "margin-right: ",
										"b" => "margin-bottom: ", "t" => "margin-top: ")),
			"wp:wrapSquare" => array(" ", array("" => "")),
			"w:tblW" => array(" ", array("w:w" => "width: ", "w:type" => "transform-type: ")),
			// Default now ends with ";" like every other non-blank default.
			"w:tblInd" => array("margin: none;",
							 array("w:w" => "margin-left: ", "w:bottomFromText" => "margin-bottom: ",
								"w:topFromText" => "margin-top: ")),
			// "padding-top: " gained the trailing space its sibling entries have.
			"w:top" => array(" ", array("w:val" => "border-top: ", "w:color" => "border-top-color: ",
									"w:w" => "padding-top: ")),
			"w:left" => array(" ", array("w:val" => "border-left: ", "w:color" => "border-left-color: ",
									"w:w" => "padding-left: ")),
			"w:bottom" => array(" ", array("w:val" => "border-bottom: ", "w:color" => "border-bottom-color: ",
									"w:w" => "padding-bottom: ")),
			// NOTE(review): identical to the "w:right" mapping below — confirm
			// that an inside-horizontal border is meant to map to border-right.
			"w:insideH" => array(" ", array("w:val" => "border-right: ", "w:color" => "border-right-color: ",
									"w:w" => "padding-right: ")),
			"w:right" => array(" ", array("w:val" => "border-right: ", "w:color" => "border-right-color: ",
									"w:w" => "padding-right: ")),
			"w:gridCol" => array(" ", array("" => "")),
			"w:tcW" => array(" ", array("" => "")),
			"w:insideV" => array(" ", array("" => "")),
			"w:tblBorders" => array("border: none;",
								 array("w:top" => "border-top: ", "w:right" => "border-right: ",
									"w:bottom" => "border-bottom: ", "w:left" => "border-left: ")),
			"w:vMerge" => array(" ", array("w:val" => "row-span: ")),
			"w:gridSpan" => array(" ", array("w:val" => "column-span: ")),
			"w:trHeight" => array(" ", array("w:val" => "height: "))
		);

The below variables are the ones that will be used to compare our results and also to find the opening and closing of our tags.
 

		// Working state initialised before the tag scan.
		// NOTE(review): the code that consumes these variables is not part of
		// this excerpt, so the grouping below is inferred from the names only.
		$head_tags = [""];

		$styles = [];

		$live_tags = [];
		$num_tabs = 0;
		$docx_html = "docx-html";
		$latest_id = -1;

		// List bookkeeping.
		$list_id = [];
		$list_id_type = [];

		// Heading bookkeeping.
		$header_id = [];
		$header_id_type = [];

		// Table bookkeeping.
		$table_id = [];
		$table_row = [];
		$table_column = [];

		// Row-span / column-span bookkeeping; both span tables start out
		// holding one empty row.
		$row_spans = [[]];
		$row_span_counter = -1;
		$row_span_cur_counter = 0;
		$removable_cols = [];
		$col_spans = [[]];
		$col_span_counter = -1;

		// Counters.
		$rows = 0;
		$cols = 0;
		$tables = 0;

		$restart = [];
		$continue = [];


After the above code, there is a long section of code that finds the tags, replaces them with their HTML equivalents, and converts the attributes we have collected.
The code below replaces all the placeholders we have gathered with their respective HTML tags. I used / in place of < and /- in place of </ because, if the < and > characters were left in, we would not be able to differentiate the original XML tags from the HTML tags we have already substituted.
 
	/**
	 * Turn the intermediate placeholder markers into real HTML markup.
	 *
	 * The replacements run in the listed order, each one operating on the
	 * result of the previous one (this is how str_replace() treats array
	 * arguments), so the ordering below must be preserved.
	 *
	 * NOTE(review): this listing originally began with eight str_replace()
	 * calls whose search string was empty (producing "/body/", "/-body/",
	 * "/-p/", "/-r/", "/-t/", "/-style/", "/-link/" and "/link/"). An empty
	 * search string never matches, so those calls had no effect and are
	 * omitted; presumably the XML tag literals were lost when the listing
	 * was published — restore them from the upstream source if that first
	 * pass is required.
	 *
	 * @param string $html_doc Text containing placeholder markers.
	 * @return string The text with every marker replaced by its HTML tag.
	 */
	public function tagReplacer($html_doc) {
		// The replacement strings previously carried a stray backslash after
		// "<" (e.g. "<\/body>", and "<\table" where "\t" is a TAB), which
		// produced invalid markup; they are plain HTML now.
		$placeholders = array(
			// Reconstructed from a call that was missing its search argument;
			// it mirrors the "/-body/" entry that follows.
			"/body/" => "<body>",
			"/-body/" => "</body>",
			"/p/" => "<span id",
			"/-p/" => "</span><br>",
			"/r/" => "<span id",
			"/-r/" => "</span>",
			"/t/" => "<span id",
			"/-t/" => "</span>",
			"/style/" => "<style>",
			"/-style/" => "</style>",
			"/link/" => "<a",
			"/link-/" => ">",
			"/-link/" => "</a>",
			"/img/" => "<img",
			"/li/" => "<li>",
			"/-li/" => "</li>",
			"/li-/" => "<li ",
			"/ul/" => "<ul>",
			"/-ul/" => "</ul>",
			"/ol/" => "<ol>",
			"/-ol/" => "</ol>",
			"/h1/" => "<h1>",
			"/-h1/" => "</h1>",
			"/h2/" => "<h2>",
			"/-h2/" => "</h2>",
			"/h3/" => "<h3>",
			"/-h3/" => "</h3>",
			"/h4/" => "<h4>",
			"/-h4/" => "</h4>",
			"/h5/" => "<h5>",
			"/-h5/" => "</h5>",
			"/h6/" => "<h6>",
			"/-h6/" => "</h6>",
			"/-table/" => "</table>",
			"/-row/" => "</tr>",
			"/-col/" => "</td>",
			"/table-/" => "<table",
			"/row-/" => "<tr",
			"/col-/" => "<td",
		);

		return str_replace(array_keys($placeholders), array_values($placeholders), $html_doc);
	}

And finally we return our HTML value with inline CSS:

	return $html_doc_out_space;

The next step is to call our class Docx2Html:

        // Load the class file, then ask the converter for the result of
        // converting test.docx.
        require "docx.class.php";

        $converter = new Docx2Html("test.docx");
        $result = $converter->convertToHtml();


Get The Full Source Code